From 59b09fd097e39bd15646eac1156889692974dc5f Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 29 Sep 2023 11:10:25 -0500
Subject: [PATCH 01/12] cuDF: Build CUDA 12.0 ARM conda packages. (#14112)

This PR builds conda packages using CUDA 12 on ARM.

This work is targeting 23.12 and depends on https://github.com/rapidsai/rmm/pull/1330.

Closes #14128.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/14112
---
 .github/workflows/build.yaml | 16 ++++++++--------
 .github/workflows/pr.yaml    | 28 ++++++++++++++--------------
 .github/workflows/test.yaml  | 16 ++++++++--------
 dependencies.yaml            | 20 ++------------------
 4 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index ab028eb89cc..dc2c81d1c77 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: ${{ inputs.build_type || 'branch' }}
@@ -100,7 +100,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 214f9c90b41..047b80f2e5c 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -26,34 +26,34 @@ jobs:
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm
     with:
       build_type: pull-request
       test_script: "ci/test_python_cudf.sh"
@@ -61,14 +61,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm
     with:
       build_type: pull-request
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -78,7 +78,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -88,7 +88,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -98,21 +98,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-dask-cudf:
     needs: wheel-tests-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: pull-request
@@ -120,7 +120,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9ca32bcfe03..e58227c30dc 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -36,7 +36,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -67,7 +67,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: nightly
diff --git a/dependencies.yaml b/dependencies.yaml
index c8ee66bd99f..c19e8765be3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -227,25 +227,9 @@ dependencies:
           # in sync with the version pinned in get_arrow.cmake.
           - libarrow==12.0.1.*
           - librdkafka>=1.9.0,<1.10.0a0
+          # Align nvcomp version with rapids-cmake
+          - nvcomp==2.6.1
           - spdlog>=1.11.0,<1.12
-    specific:
-      - output_types: conda
-        matrices:
-          - matrix:
-              arch: x86_64
-            packages:
-              # Align nvcomp version with rapids-cmake
-              # TODO: not yet available for aarch64 CUDA 12
-              - &nvcomp nvcomp==2.6.1
-          - matrix:
-              arch: aarch64
-              cuda: "11.8"
-            packages:
-              - *nvcomp
-          # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3,
-          # all CUDA/arch combinations should be supported by existing packages.
-          - matrix:
-            packages:
   build_wheels:
     common:
       - output_types: pyproject

From 66a655ce80e8b0accb80ea4e23799d23a82a35a2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 3 Oct 2023 08:00:44 -0500
Subject: [PATCH 02/12] Fix inaccuracy in decimal128 rounding. (#14233)

Fixes a bug where floating-point values were used in decimal128 rounding, giving wrong results.

Closes https://github.com/rapidsai/cudf/issues/14210.

Authors:
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - Divye Gala (https://github.com/divyegala)
   - Mark Harris (https://github.com/harrism)
---
 cpp/src/round/round.cu          |  5 ++-
 cpp/tests/round/round_tests.cpp | 79 +++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 4b3f80fc6e2..41cce57d55b 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -271,7 +271,10 @@ std::unique_ptr<column> round_with(column_view const& input,
                                out_view.template end<Type>(),
                                static_cast<Type>(0));
   } else {
-    Type const n = std::pow(10, scale_movement);
+    Type n = 10;
+    for (int i = 1; i < scale_movement; ++i) {
+      n *= 10;
+    }
     thrust::transform(rmm::exec_policy(stream),
                       input.begin<Type>(),
                       input.end<Type>(),
diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index d802c0c2706..f97bb7a5323 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -703,4 +703,83 @@ TEST_F(RoundTests, BoolTestHalfUp)
   EXPECT_THROW(cudf::round(input, -2, cudf::rounding_method::HALF_UP), cudf::logic_error);
 }
 
+// Use __uint128_t for demonstration.
+constexpr __uint128_t operator""_uint128_t(const char* s)
+{
+  __uint128_t ret = 0;
+  for (int i = 0; s[i] != '\0'; ++i) {
+    ret *= 10;
+    if ('0' <= s[i] && s[i] <= '9') { ret += s[i] - '0'; }
+  }
+  return ret;
+}
+
+TEST_F(RoundTests, HalfEvenErrorsA)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.5 at scale -37 should round HALF_EVEN to 0, because 0 is an even number
+    auto const input =
+      fp_wrapper{{5000000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{0}, scale_type{0}};
+    auto const result   = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfEvenErrorsB)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.125 at scale -37 should round HALF_EVEN to 0.12, because 2 is an even number
+    auto const input =
+      fp_wrapper{{1250000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{12}, scale_type{-2}};
+    auto const result   = cudf::round(input, 2, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfEvenErrorsC)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.0625 at scale -37 should round HALF_EVEN to 0.062, because 2 is an even number
+    auto const input =
+      fp_wrapper{{0625000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{62}, scale_type{-3}};
+    auto const result   = cudf::round(input, 3, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfUpErrorsA)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.25 at scale -37 should round HALF_UP to 0.3
+    auto const input =
+      fp_wrapper{{2500000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{3}, scale_type{-1}};
+    auto const result   = cudf::round(input, 1, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 3964950ba2fecf7f962917276058a6381d396246 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 3 Oct 2023 15:11:15 -0500
Subject: [PATCH 03/12] Fix inaccurate ceil/floor and inaccurate rescaling
 casts of fixed-point values. (#14242)

This is a follow-up PR to #14233. This PR fixes a bug where floating-point values were used as intermediates in ceil/floor unary operations and cast operations that require rescaling for fixed-point types, giving inaccurate results.

See also:
- https://github.com/rapidsai/cudf/pull/14233#discussion_r1340786769
- https://github.com/rapidsai/cudf/issues/14243

Authors:
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - Mike Wilson (https://github.com/hyperbolic2346)
   - Vukasin Milovanovic (https://github.com/vuule)
---
 cpp/src/unary/cast_ops.cu          |  8 +++++-
 cpp/src/unary/math_ops.cu          |  8 ++++--
 cpp/tests/unary/cast_tests.cpp     | 40 ++++++++++++++++++++++++++++++
 cpp/tests/unary/unary_ops_test.cpp | 33 ++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index f40ace8d10b..1c81f266200 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -199,7 +199,13 @@ std::unique_ptr<column> rescale(column_view input,
       }
       return output_column;
     }
-    auto const scalar = make_fixed_point_scalar<T>(std::pow(10, -diff), scale_type{diff}, stream);
+
+    RepType scalar_value = 10;
+    for (int i = 1; i < -diff; ++i) {
+      scalar_value *= 10;
+    }
+
+    auto const scalar = make_fixed_point_scalar<T>(scalar_value, scale_type{diff}, stream);
     return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr);
   }
 };
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 961f3a9e720..d0cae81a9c8 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -295,7 +295,11 @@ std::unique_ptr<column> unary_op_with(column_view const& input,
     input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr);
 
   auto out_view = result->mutable_view();
-  Type const n  = std::pow(10, -input.type().scale());
+
+  Type n = 10;
+  for (int i = 1; i < -input.type().scale(); ++i) {
+    n *= 10;
+  }
 
   thrust::transform(rmm::exec_policy(stream),
                     input.begin<Type>(),
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index 9506e1918c0..d565359a4ea 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -30,6 +30,8 @@
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <cuda/std/limits>
+
 #include <type_traits>
 #include <vector>
 
@@ -967,6 +969,44 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTests, ValidateCastRescalePrecision)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  // This test is designed to protect against floating point conversion
+  // introducing errors in fixed-point arithmetic. The rescaling that occurs
+  // during casting to different scales should only use fixed-precision math.
+  // Realistically, we are only able to show precision failures due to floating
+  // conversion in a few very specific circumstances where dividing by specific
+  // powers of 10 works against us.  Some examples: 10^23, 10^25, 10^26, 10^27,
+  // 10^30, 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation.
+  // For completeness and to ensure that we are not missing any other cases, we
+  // test casting to/from all scales in the range of each decimal type. Values
+  // that are powers of ten show this error more readily than non-powers of 10
+  // because the rescaling factor is a power of 10, meaning that errors in
+  // division are more visible.
+  constexpr auto min_scale = -cuda::std::numeric_limits<RepType>::digits10;
+  for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+    for (int result_scale = 0; result_scale >= min_scale; --result_scale) {
+      RepType input_value = 1;
+      for (int k = 0; k > input_scale; --k) {
+        input_value *= 10;
+      }
+      RepType result_value = 1;
+      for (int k = 0; k > result_scale; --k) {
+        result_value *= 10;
+      }
+      auto const input    = fp_wrapper{{input_value}, scale_type{input_scale}};
+      auto const expected = fp_wrapper{{result_value}, scale_type{result_scale}};
+      auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(result_scale));
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+    }
+  }
+}
+
 TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask)
 {
   using namespace numeric;
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index 49764f22373..76d1f769856 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -24,6 +24,8 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <cuda/std/limits>
+
 template <typename T>
 cudf::test::fixed_width_column_wrapper<T> create_fixed_columns(cudf::size_type start,
                                                                cudf::size_type size,
@@ -372,4 +374,35 @@ TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryFloorLarge)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointUnaryTests, ValidateCeilFloorPrecision)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  // This test is designed to protect against floating point conversion
+  // introducing errors in fixed-point arithmetic. The rounding that occurs
+  // during ceil/floor should only use fixed-precision math. Realistically,
+  // we are only able to show precision failures due to floating conversion in
+  // a few very specific circumstances where dividing by specific powers of 10
+  // works against us.  Some examples: 10^23, 10^25, 10^26, 10^27, 10^30,
+  // 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. For
+  // completeness and to ensure that we are not missing any other cases, we
+  // test all scales representable in the range of each decimal type.
+  constexpr auto min_scale = -cuda::std::numeric_limits<RepType>::digits10;
+  for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+    RepType input_value = 1;
+    for (int k = 0; k > input_scale; --k) {
+      input_value *= 10;
+    }
+    auto const input       = fp_wrapper{{input_value}, scale_type{input_scale}};
+    auto const expected    = fp_wrapper{{input_value}, scale_type{input_scale}};
+    auto const ceil_result = cudf::unary_operation(input, cudf::unary_operator::CEIL);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, ceil_result->view());
+    auto const floor_result = cudf::unary_operation(input, cudf::unary_operator::FLOOR);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, floor_result->view());
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 29556a2514f4d274164a27a80539410da7e132d6 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 3 Oct 2023 14:44:28 -0700
Subject: [PATCH 04/12] Remove the use of volatile in ORC (#14175)

`volatile` should not be required in our code unless there are compiler or synchronization issues.
This PR removes its use in the ORC reader and writer.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14175
---
 cpp/src/io/orc/stats_enc.cu   |  4 +-
 cpp/src/io/orc/stripe_data.cu | 82 +++++++++++++++--------------------
 cpp/src/io/orc/stripe_enc.cu  | 14 +++---
 cpp/src/io/orc/stripe_init.cu |  2 +-
 4 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 95f1db5bfd1..479a2dfada3 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -76,8 +76,8 @@ __global__ void __launch_bounds__(block_size, 1)
 {
   using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;
   __shared__ typename block_scan::TempStorage temp_storage;
-  volatile uint32_t stats_size = 0;
-  auto t                       = threadIdx.x;
+  uint32_t stats_size = 0;
+  auto t              = threadIdx.x;
   __syncthreads();
   for (thread_index_type start = 0; start < statistics_count; start += block_size) {
     uint32_t stats_len = 0, stats_pos;
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 3edcd3d83b2..0b249bbdafe 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -142,9 +142,7 @@ struct orcdec_state_s {
  * @param[in] base Pointer to raw byte stream data
  * @param[in] len Stream length in bytes
  */
-static __device__ void bytestream_init(volatile orc_bytestream_s* bs,
-                                       uint8_t const* base,
-                                       uint32_t len)
+static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len)
 {
   uint32_t pos   = (len > 0) ? static_cast<uint32_t>(7 & reinterpret_cast<size_t>(base)) : 0;
   bs->base       = base - pos;
@@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs,
  * @param[in] bs Byte stream input
  * @param[in] bytes_consumed Number of bytes that were consumed
  */
-static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs,
-                                              uint32_t bytes_consumed)
+static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed)
 {
   uint32_t pos     = bs->pos;
   uint32_t len     = bs->len;
@@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t)
  * @param[in] pos Position in byte stream
  * @return byte
  */
-inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos)
+inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos)
 {
   return bs->buf.u8[pos & (bytestream_buffer_size - 1)];
 }
@@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int
  * @param[in] pos Position in byte stream
  * @result bits
  */
-inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos)
+inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos)
 {
   uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2];
   uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2];
@@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int
  * @param[in] numbits number of bits
  * @return bits
  */
-inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos)
+inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos)
 {
   uint32_t a    = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2];
   uint32_t b    = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2];
@@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int
  * @param[in] numbits number of bits
  * @return decoded value
  */
-inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs,
-                                               int bitpos,
-                                               uint32_t numbits)
+inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits)
 {
   int idx    = bitpos >> 5;
   uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123);
@@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs,
  * @param[in] numbits number of bits
  * @return decoded value
  */
-inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs,
-                                                 int bitpos,
-                                                 uint32_t numbits)
+inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits)
 {
   int idx       = bitpos >> 5;
   uint32_t a    = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123);
@@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs,
  * @param[in] numbits number of bits
  * @param[out] result decoded value
  */
-inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
+inline __device__ void bytestream_readbe(orc_bytestream_s* bs,
                                          int bitpos,
                                          uint32_t numbits,
                                          uint32_t& result)
@@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
  * @param[in] numbits number of bits
  * @param[out] result decoded value
  */
-inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
+inline __device__ void bytestream_readbe(orc_bytestream_s* bs,
                                          int bitpos,
                                          uint32_t numbits,
                                          int32_t& result)
@@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
  * @param[in] numbits number of bits
  * @param[out] result decoded value
  */
-inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
+inline __device__ void bytestream_readbe(orc_bytestream_s* bs,
                                          int bitpos,
                                          uint32_t numbits,
                                          uint64_t& result)
@@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
  * @param[in] numbits number of bits
  * @param[out] result decoded value
  */
-inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
+inline __device__ void bytestream_readbe(orc_bytestream_s* bs,
                                          int bitpos,
                                          uint32_t numbits,
                                          int64_t& result)
@@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs,
  * @return length of varint in bytes
  */
 template <class T>
-inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos)
+inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos)
 {
   if (bytestream_readbyte(bs, pos) > 0x7f) {
     uint32_t next32 = bytestream_readu32(bs, pos + 1);
@@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos)
  * @return new position in byte stream buffer
  */
 template <class T>
-inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result)
+inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result)
 {
   uint32_t v = bytestream_readbyte(bs, pos++);
   if (v > 0x7f) {
@@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p
 /**
  * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals)
  */
-inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
+inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos)
 {
   auto byte                  = bytestream_readbyte(bs, pos++);
   __int128_t const sign_mask = -(int32_t)(byte & 1);
@@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int
 /**
  * @brief Decodes an unsigned 32-bit varint
  */
-inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result)
+inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result)
 {
   uint32_t u;
   pos    = decode_base128_varint<uint32_t>(bs, pos, u);
@@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint
 /**
  * @brief Decodes an unsigned 64-bit varint
  */
-inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result)
+inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result)
 {
   uint64_t u;
   pos    = decode_base128_varint<uint64_t>(bs, pos, u);
@@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint
 /**
  * @brief Signed version of 32-bit decode_varint
  */
-inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result)
+inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result)
 {
   uint32_t u;
   pos    = decode_base128_varint<uint32_t>(bs, pos, u);
@@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3
 /**
  * @brief Signed version of 64-bit decode_varint
  */
-inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result)
+inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result)
 {
   uint64_t u;
   pos    = decode_base128_varint<uint64_t>(bs, pos, u);
@@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6
  * @return number of values decoded
  */
 template <class T>
-inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t)
+inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t)
 {
   for (uint32_t n = 1; n < numvals; n <<= 1) {
     __syncthreads();
@@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals,
  * @return number of values decoded
  */
 template <class T>
-static __device__ uint32_t Integer_RLEv1(
-  orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t)
+static __device__ uint32_t
+Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t)
 {
   uint32_t numvals, numruns;
   if (t == 0) {
@@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = {
  */
 template <class T>
 static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
-                                         volatile orc_rlev2_state_s* rle,
-                                         volatile T* vals,
+                                         orc_rlev2_state_s* rle,
+                                         T* vals,
                                          uint32_t maxvals,
                                          int t,
                                          bool has_buffered_values = false)
@@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
  *
  * @return 32-bit value
  */
-inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos)
+inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos)
 {
   uint32_t a = vals[(bitpos >> 5) + 0];
   uint32_t b = vals[(bitpos >> 5) + 1];
@@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi
  *
  * @return number of values decoded
  */
-static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs,
-                                    volatile orc_byterle_state_s* rle,
-                                    volatile uint8_t* vals,
-                                    uint32_t maxvals,
-                                    int t)
+static __device__ uint32_t
+Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t)
 {
   uint32_t numvals, numruns;
   int r, tr;
@@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1,
  * @return number of values decoded
  */
 static __device__ int Decode_Decimals(orc_bytestream_s* bs,
-                                      volatile orc_byterle_state_s* scratch,
-                                      volatile orcdec_state_s::values& vals,
+                                      orc_byterle_state_s* scratch,
+                                      orcdec_state_s::values& vals,
                                       int val_scale,
                                       int numvals,
                                       type_id dtype_id,
@@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size)
       }
       __syncthreads();
       while (s->top.dict.dict_len > 0) {
-        uint32_t numvals        = min(s->top.dict.dict_len, blockDim.x), len;
-        volatile uint32_t* vals = s->vals.u32;
+        uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len;
+        uint32_t* vals   = s->vals.u32;
         bytestream_fill(&s->bs, t);
         __syncthreads();
         if (is_rlev1(s->chunk.encoding_kind)) {
@@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s,
                          min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x));
     if (s->chunk.valid_map_base != nullptr) {
       // We have a present stream
-      uint32_t rmax  = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row);
-      auto r         = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row);
-      uint32_t valid = (t < nrows && r < rmax)
-                         ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1
-                         : 0;
-      volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count];
+      uint32_t rmax       = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row);
+      auto r              = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row);
+      uint32_t valid      = (t < nrows && r < rmax)
+                              ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1
+                              : 0;
+      auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count];
       uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row;
       if (t < nrows) { row_ofs_plus1[t] = valid; }
       lengths_to_positions<uint16_t>(row_ofs_plus1, nrows, t);
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 73c41e2bbcd..4841fb1141a 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -53,7 +53,7 @@ constexpr bool zero_pll_war = true;
 struct byterle_enc_state_s {
   uint32_t literal_run;
   uint32_t repeat_run;
-  volatile uint32_t rpt_map[(512 / 32) + 1];
+  uint32_t rpt_map[(512 / 32) + 1];
 };
 
 struct intrle_enc_state_s {
@@ -63,7 +63,7 @@ struct intrle_enc_state_s {
   uint32_t literal_w;
   uint32_t hdr_bytes;
   uint32_t pl_bytes;
-  volatile uint32_t delta_map[(512 / 32) + 1];
+  uint32_t delta_map[(512 / 32) + 1];
 };
 
 struct strdata_enc_state_s {
@@ -366,7 +366,7 @@ static __device__ uint32_t IntegerRLE(
   using block_reduce = cub::BlockReduce<T, block_size>;
   uint8_t* dst       = s->stream.data_ptrs[cid] + s->strm_pos[cid];
   uint32_t out_cnt   = 0;
-  __shared__ volatile uint64_t block_vmin;
+  __shared__ uint64_t block_vmin;
 
   while (numvals > 0) {
     T v0               = (t < numvals) ? inbuf[(inpos + t) & inmask] : 0;
@@ -615,7 +615,7 @@ static __device__ void StoreStringData(uint8_t* dst,
  * @param[in] t thread id
  */
 template <class T>
-inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t)
+inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t)
 {
   for (uint32_t n = 1; n < numvals; n <<= 1) {
     __syncthreads();
@@ -1143,7 +1143,7 @@ __global__ void __launch_bounds__(256)
                            uint32_t comp_block_align)
 {
   __shared__ __align__(16) StripeStream ss;
-  __shared__ uint8_t* volatile uncomp_base_g;
+  __shared__ uint8_t* uncomp_base_g;
 
   auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align);
   auto const padded_comp_block_size   = util::round_up_unsafe(max_comp_blk_size, comp_block_align);
@@ -1196,8 +1196,8 @@ __global__ void __launch_bounds__(1024)
                              uint32_t max_comp_blk_size)
 {
   __shared__ __align__(16) StripeStream ss;
-  __shared__ uint8_t const* volatile comp_src_g;
-  __shared__ uint32_t volatile comp_len_g;
+  __shared__ uint8_t const* comp_src_g;
+  __shared__ uint32_t comp_len_g;
 
   auto const stripe_id = blockIdx.x;
   auto const stream_id = blockIdx.y;
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index 8eeca504121..b31a4a081d1 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr
           : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row;
       for (int j = t4; j < rowgroup_size4; j += 4) {
         ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] =
-          ((volatile uint32_t*)&s->rowgroups[i])[j];
+          ((uint32_t*)&s->rowgroups[i])[j];
       }
       row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows;
       // Updating in case of struct

From d87e181daa67d8fb1a029fc2c09e2f561d1e7234 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 4 Oct 2023 13:25:56 -0700
Subject: [PATCH 05/12] Expose streams in binaryop APIs (#14187)

Contributes to #925

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/14187
---
 cpp/include/cudf/binaryop.hpp           |   8 ++
 cpp/src/binaryop/binaryop.cpp           |  12 ++-
 cpp/src/binaryop/compiled/binary_ops.cu |   6 +-
 cpp/tests/CMakeLists.txt                |   1 +
 cpp/tests/streams/binaryop_test.cpp     | 126 ++++++++++++++++++++++++
 5 files changed, 147 insertions(+), 6 deletions(-)
 create mode 100644 cpp/tests/streams/binaryop_test.cpp

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 77d6a4d1e89..9df4b4eb00f 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -102,6 +102,7 @@ enum class binary_operator : int32_t {
  * @param rhs         The right operand column
  * @param op          The binary operator
  * @param output_type The desired data type of the output column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr          Device memory resource used to allocate the returned column's device memory
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
@@ -115,6 +116,7 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   binary_operator op,
   data_type output_type,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -131,6 +133,7 @@ std::unique_ptr<column> binary_operation(
  * @param rhs         The right operand scalar
  * @param op          The binary operator
  * @param output_type The desired data type of the output column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr          Device memory resource used to allocate the returned column's device memory
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
@@ -144,6 +147,7 @@ std::unique_ptr<column> binary_operation(
   scalar const& rhs,
   binary_operator op,
   data_type output_type,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -158,6 +162,7 @@ std::unique_ptr<column> binary_operation(
  * @param rhs         The right operand column
  * @param op          The binary operator
  * @param output_type The desired data type of the output column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr          Device memory resource used to allocate the returned column's device memory
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
@@ -172,6 +177,7 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   binary_operator op,
   data_type output_type,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -189,6 +195,7 @@ std::unique_ptr<column> binary_operation(
  * @param output_type The desired data type of the output column. It is assumed
  *                    that output_type is compatible with the output data type
  *                    of the function in the PTX code
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr          Device memory resource used to allocate the returned column's device memory
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
@@ -201,6 +208,7 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   std::string const& ptx,
   data_type output_type,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index ef07de8c461..6b413ab2be4 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -405,38 +405,42 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          column_view const& rhs,
                                          binary_operator op,
                                          data_type output_type,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr);
+  return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 }
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          scalar const& rhs,
                                          binary_operator op,
                                          data_type output_type,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr);
+  return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 }
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          column_view const& rhs,
                                          binary_operator op,
                                          data_type output_type,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr);
+  return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 }
 
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          column_view const& rhs,
                                          std::string const& ptx,
                                          data_type output_type,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr);
+  return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 1f7f342632a..85ab5c6d6cb 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -47,14 +47,16 @@ namespace {
 struct scalar_as_column_view {
   using return_type = typename std::pair<column_view, std::unique_ptr<column>>;
   template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
-  return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+  return_type operator()(scalar const& s,
+                         rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource*)
   {
     auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s));
     auto col_v               = column_view(s.type(),
                              1,
                              h_scalar_type_view.data(),
                              reinterpret_cast<bitmask_type const*>(s.validity_data()),
-                             !s.is_valid());
+                             !s.is_valid(stream));
     return std::pair{col_v, std::unique_ptr<column>(nullptr)};
   }
   template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 04939f3cd6d..ac13c121530 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -622,6 +622,7 @@ ConfigureTest(
   STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
 )
 
+ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
new file mode 100644
index 00000000000..2520aed0458
--- /dev/null
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/binaryop/util/runtime_support.h>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class BinaryopTest : public cudf::test::BaseFixture {};
+
+TEST_F(BinaryopTest, ColumnColumn)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ColumnScalar)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::numeric_scalar<int32_t> rhs{23, true, cudf::test::get_default_stream()};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ScalarColumn)
+{
+  cudf::numeric_scalar<int32_t> lhs{42, true, cudf::test::get_default_stream()};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+class BinaryopPTXTest : public BinaryopTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
+};
+
+TEST_F(BinaryopPTXTest, ColumnColumnPTX)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs{15, 25, 35, 45, 55};
+
+  // c = a*a*a + b*b
+  char const* ptx =
+    R"***(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_70
+.address_size 64
+
+	// .globl	_ZN8__main__7add$241Eix
+.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix;
+.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx;
+
+.visible .func  (.param .b32 func_retval0) _ZN8__main__7add$241Eix(
+	.param .b64 _ZN8__main__7add$241Eix_param_0,
+	.param .b32 _ZN8__main__7add$241Eix_param_1,
+	.param .b64 _ZN8__main__7add$241Eix_param_2
+)
+{
+	.reg .b32 	%r<3>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [_ZN8__main__7add$241Eix_param_0];
+	ld.param.u32 	%r1, [_ZN8__main__7add$241Eix_param_1];
+	ld.param.u64 	%rd2, [_ZN8__main__7add$241Eix_param_2];
+	cvt.s64.s32	%rd3, %r1;
+	mul.wide.s32 	%rd4, %r1, %r1;
+	mul.lo.s64 	%rd5, %rd4, %rd3;
+	mul.lo.s64 	%rd6, %rd2, %rd2;
+	add.s64 	%rd7, %rd6, %rd5;
+	st.u64 	[%rd1], %rd7;
+	mov.u32 	%r2, 0;
+	st.param.b32	[func_retval0+0], %r2;
+	ret;
+}
+
+)***";
+
+  auto const stream = cudf::test::get_default_stream();  // never fall back to the default arg
+  cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int32_t>()), stream);
+  cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()), stream);
+}

From b120f7e73e882b4eaa6b5a2cb91aeed20bf1198d Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 4 Oct 2023 14:23:24 -0700
Subject: [PATCH 06/12] Improve `contains_column` by invoking `contains_table`
 (#14238)

Part of https://github.com/rapidsai/cudf/issues/12261

This PR simplifies the `contains_column` implementation by invoking `contains_table` and gets rid of the use of the cudf `unordered_multiset`. It also removes the `unordered_multiset` header file from libcudf.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14238
---
 cpp/src/hash/unordered_multiset.cuh | 159 ----------------------------
 cpp/src/search/contains_column.cu   |  67 +-----------
 2 files changed, 1 insertion(+), 225 deletions(-)
 delete mode 100644 cpp/src/hash/unordered_multiset.cuh

diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh
deleted file mode 100644
index 183042fc0f4..00000000000
--- a/cpp/src/hash/unordered_multiset.cuh
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/hashing/detail/default_hash.cuh>
-#include <cudf/hashing/detail/helper_functions.cuh>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/copy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
-
-#include <cuda/atomic>
-
-namespace cudf {
-namespace detail {
-/*
- *  Device view of the unordered multiset
- */
-template <typename Element,
-          typename Hasher   = cudf::hashing::detail::default_hash<Element>,
-          typename Equality = equal_to<Element>>
-class unordered_multiset_device_view {
- public:
-  unordered_multiset_device_view(size_type hash_size,
-                                 size_type const* hash_begin,
-                                 Element const* hash_data)
-    : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals()
-  {
-  }
-
-  bool __device__ contains(Element e) const
-  {
-    size_type loc = hasher(e) % (2 * hash_size);
-
-    for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) {
-      if (equals(hash_data[i], e)) return true;
-    }
-
-    return false;
-  }
-
- private:
-  Hasher hasher;
-  Equality equals;
-  size_type hash_size;
-  size_type const* hash_begin;
-  Element const* hash_data;
-};
-
-/*
- * Fixed size set on a device.
- */
-template <typename Element,
-          typename Hasher   = cudf::hashing::detail::default_hash<Element>,
-          typename Equality = equal_to<Element>>
-class unordered_multiset {
- public:
-  /**
-   * @brief Factory to construct a new unordered_multiset
-   */
-  static unordered_multiset<Element> create(column_view const& col, rmm::cuda_stream_view stream)
-  {
-    auto d_column = column_device_view::create(col, stream);
-    auto d_col    = *d_column;
-
-    auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-      2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource());
-    auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-      2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource());
-    auto hash_data = rmm::device_uvector<Element>(d_col.size(), stream);
-
-    Hasher hasher;
-    size_type* d_hash_bins_start = hash_bins_start.data();
-    size_type* d_hash_bins_end   = hash_bins_end.data();
-    Element* d_hash_data         = hash_data.data();
-
-    thrust::for_each(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      thrust::make_counting_iterator<size_type>(col.size()),
-      [d_hash_bins_start, d_col, hasher] __device__(size_t idx) {
-        if (!d_col.is_null(idx)) {
-          Element e     = d_col.element<Element>(idx);
-          size_type tmp = hasher(e) % (2 * d_col.size());
-          cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(d_hash_bins_start + tmp)};
-          ref.fetch_add(1, cuda::std::memory_order_relaxed);
-        }
-      });
-
-    thrust::exclusive_scan(rmm::exec_policy(stream),
-                           hash_bins_start.begin(),
-                           hash_bins_start.end(),
-                           hash_bins_end.begin());
-
-    thrust::copy(rmm::exec_policy(stream),
-                 hash_bins_end.begin(),
-                 hash_bins_end.end(),
-                 hash_bins_start.begin());
-
-    thrust::for_each(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      thrust::make_counting_iterator<size_type>(col.size()),
-      [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) {
-        if (!d_col.is_null(idx)) {
-          Element e     = d_col.element<Element>(idx);
-          size_type tmp = hasher(e) % (2 * d_col.size());
-          cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(d_hash_bins_end + tmp)};
-          size_type offset    = ref.fetch_add(1, cuda::std::memory_order_relaxed);
-          d_hash_data[offset] = e;
-        }
-      });
-
-    return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data));
-  }
-
-  unordered_multiset_device_view<Element, Hasher, Equality> to_device() const
-  {
-    return unordered_multiset_device_view<Element, Hasher, Equality>(
-      size, hash_bins.data(), hash_data.data());
-  }
-
- private:
-  unordered_multiset(size_type size,
-                     rmm::device_uvector<size_type>&& hash_bins,
-                     rmm::device_uvector<Element>&& hash_data)
-    : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)}
-  {
-  }
-
-  size_type size;
-  rmm::device_uvector<size_type> hash_bins;
-  rmm::device_uvector<Element> hash_data;
-};
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index 4363bd212fe..85971647434 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -14,23 +14,14 @@
  * limitations under the License.
  */
 
-#include <hash/unordered_multiset.cuh>
-
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/search.hpp>
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform.h>
-#include <thrust/uninitialized_fill.h>
 
 namespace cudf {
 namespace detail {
@@ -38,61 +29,7 @@ namespace detail {
 namespace {
 
 struct contains_column_dispatch {
-  template <typename Element, typename Haystack>
-  struct contains_fn {
-    bool __device__ operator()(size_type const idx) const
-    {
-      if (needles_have_nulls && needles.is_null_nocheck(idx)) {
-        // Exit early. The value doesn't matter, and will be masked as a null element.
-        return true;
-      }
-
-      return haystack.contains(needles.template element<Element>(idx));
-    }
-
-    Haystack const haystack;
-    column_device_view const needles;
-    bool const needles_have_nulls;
-  };
-
-  template <typename Element, CUDF_ENABLE_IF(!is_nested<Element>())>
-  std::unique_ptr<column> operator()(column_view const& haystack,
-                                     column_view const& needles,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
-  {
-    auto result = make_numeric_column(data_type{type_to_id<bool>()},
-                                      needles.size(),
-                                      copy_bitmask(needles, stream, mr),
-                                      needles.null_count(),
-                                      stream,
-                                      mr);
-    if (needles.is_empty()) { return result; }
-
-    auto const out_begin = result->mutable_view().template begin<bool>();
-    if (haystack.is_empty()) {
-      thrust::uninitialized_fill(
-        rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false);
-      return result;
-    }
-
-    auto const haystack_set = cudf::detail::unordered_multiset<Element>::create(haystack, stream);
-    auto const haystack_set_dv = haystack_set.to_device();
-    auto const needles_cdv_ptr = column_device_view::create(needles, stream);
-
-    thrust::transform(rmm::exec_policy(stream),
-                      thrust::make_counting_iterator<size_type>(0),
-                      thrust::make_counting_iterator<size_type>(needles.size()),
-                      out_begin,
-                      contains_fn<Element, decltype(haystack_set_dv)>{
-                        haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()});
-
-    result->set_null_count(needles.null_count());
-
-    return result;
-  }
-
-  template <typename Element, CUDF_ENABLE_IF(is_nested<Element>())>
+  template <typename Element>
   std::unique_ptr<column> operator()(column_view const& haystack,
                                      column_view const& needles,
                                      rmm::cuda_stream_view stream,
@@ -144,8 +81,6 @@ std::unique_ptr<column> contains(column_view const& haystack,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch");
-
   return cudf::type_dispatcher(
     haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr);
 }

From 5d311ea76ddc8bdbb357b6afdf64dfce6ece39a7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 5 Oct 2023 13:21:31 -0400
Subject: [PATCH 07/12] Fix strings replace for adjacent, identical multi-byte
 UTF-8 character targets (#14235)

Fixes a bug that can occur when replacing all occurrences of a multi-byte UTF-8 target in a string: when the target matched at adjacent positions in the same string, some occurrences were missed.
Specialized gtest is also added.

Found while working on #13891

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14235
---
 cpp/src/strings/replace/replace.cu  |  2 +-
 cpp/tests/strings/replace_tests.cpp | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index a622d1a742d..acc1502f4d6 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -97,7 +97,7 @@ struct replace_row_parallel_fn {
       } else {
         bytes += d_repl.size_bytes() - d_target.size_bytes();
       }
-      position = d_str.find(d_target, position + d_target.size_bytes());
+      position = d_str.find(d_target, position + d_target.length());
       --max_n;
     }
     if (out_ptr)  // copy whats left (or right depending on your point of view)
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index f143983aded..f04bb832f09 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget)
+{
+  auto input = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"});
+  auto strings_view = cudf::strings_column_view(input);
+  // replace all occurrences of 'é' with 'e'
+  cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"});
+
+  auto stream = cudf::get_default_stream();
+  auto mr     = rmm::mr::get_current_device_resource();
+
+  auto target  = cudf::string_scalar("é", true, stream);
+  auto repl    = cudf::string_scalar("e", true, stream);
+  auto results = cudf::strings::replace(strings_view, target, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<algorithm::CHAR_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<algorithm::ROW_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsReplaceTest, ReplaceSlice)
 {
   std::vector<char const*> h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""};

From 04e2cd6ff4d525390d4a416651cefa16e11c2a50 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 6 Oct 2023 09:33:16 -0400
Subject: [PATCH 08/12] cudf::detail::pinned_allocator doesn't throw from
 `deallocate` (#14251)

Fixes #14165

The deallocate function is called by the `pinned_host_vector`. Throwing from destructors is bad since the exception can't be caught, and it generally gets converted into a runtime abort (`std::terminate`, i.e. SIGABRT).

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Divye Gala (https://github.com/divyegala)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/14251
---
 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
index 9e2b85ea129..eee974c8399 100644
--- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
@@ -169,7 +169,12 @@ class pinned_allocator {
    *        It is the responsibility of the caller to destroy
    *        the objects stored at \p p.
    */
-  __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); }
+  __host__ inline void deallocate(pointer p, size_type /*cnt*/)
+  {
+    // Deliberately non-throwing: a failed cudaFreeHost is only reported through assert.
+    [[maybe_unused]] auto const free_status = cudaFreeHost(p);
+    assert(free_status == cudaSuccess);
+  }
 
   /**
    * @brief This method returns the maximum size of the \c cnt parameter

From fc3694730334971c6c7bd916bf36b71302cfcd42 Mon Sep 17 00:00:00 2001
From: Mike Wilson <hyperbolic2346@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:03:32 -0400
Subject: [PATCH 09/12] Fixing parquet list of struct interpretation (#13715)

This change alters how we interpret non-annotated data in a parquet file. Most modern parquet writers would produce something like:
```
message spark_schema {
  required int32 id;
  optional group phoneNumbers (LIST) {
    repeated group phone {
      required int64 number;
      optional binary kind (STRING);
    }
  }
}
```

But the list annotation isn't required. If it didn't exist, we would incorrectly interpret this schema as a struct of struct and not a list of struct. This change alters the code to look at the child and see if it is repeated. If it is, this indicates a list.

closes #13664

Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/13715
---
 cpp/src/io/parquet/page_decode.cuh         |  2 +-
 cpp/src/io/parquet/parquet.hpp             |  2 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp | 86 ++++++++++++++++++++--
 cpp/src/io/parquet/reader_impl_helpers.hpp |  1 +
 cpp/tests/io/parquet_test.cpp              | 78 ++++++++++++++++++++
 5 files changed, 162 insertions(+), 7 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index cdc29197eb3..d70cabdd35f 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value
           // for nested schemas, it's more complicated.  This warp will visit 32 incoming values,
           // however not all of them will necessarily represent a value at this nesting level. so
           // the validity bit for thread t might actually represent output value t-6. the correct
-          // position for thread t's bit is cur_value_count. for cuda 11 we could use
+          // position for thread t's bit is thread_value_count. for cuda 11 we could use
           // __reduce_or_sync(), but until then we have to do a warp reduce.
           WarpReduceOr32(is_valid << thread_value_count);
 
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index c2affc774c2..1df49262e87 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -206,7 +206,7 @@ struct SchemaElement {
   {
     return type == UNDEFINED_TYPE &&
            // this assumption might be a little weak.
-           ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2));
+           ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1));
   }
 };
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index fcaa610fbb7..9778cfc47d2 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -175,6 +175,81 @@ type_id to_type_id(SchemaElement const& schema,
   return type_id::EMPTY;
 }
 
+void metadata::sanitize_schema()
+{
+  // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred.
+  // There are also a lot of rules that simply aren't followed and are expected to be worked around.
+  // This step sanitizes the metadata to something that isn't ambiguous.
+  //
+  // Take, for example, the following schema:
+  //
+  //  required group field_id=-1 user {
+  //    required int32 field_id=-1 id;
+  //    optional group field_id=-1 phoneNumbers {
+  //      repeated group field_id=-1 phone {
+  //        required int64 field_id=-1 number;
+  //        optional binary field_id=-1 kind (String);
+  //      }
+  //    }
+  //  }
+  //
+  // This real-world example has no annotations telling us what is a list or a struct. On the
+  // surface this looks like a column of id's and a column of list<struct<int64, string>>, but this
+  // actually should be interpreted as a struct<list<struct<int64, string>>>. The phoneNumbers field
+  // has to be a struct because it is a group with no repeated tag and we have no annotation. The
+  // repeated group is actually BOTH a struct due to the multiple children and a list due to
+  // repeated.
+  //
+  // Such a repeated group is rewritten below into a LIST node holding one synthetic struct node.
+
+  std::function<void(size_t)> process = [&](size_t schema_idx) -> void {
+    if (schema_idx >= schema.size()) { return; }  // size_t is never negative; range-check instead
+    auto& schema_elem = schema[schema_idx];
+    if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) {
+      auto const parent_type = schema[schema_elem.parent_idx].converted_type;
+      if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 &&
+          parent_type != LIST && parent_type != MAP) {
+        // This is a list of structs, so we need to mark this as a list, but also
+        // add a struct child and move this element's children to the struct
+        schema_elem.converted_type  = LIST;
+        schema_elem.repetition_type = OPTIONAL;
+        auto const struct_node_idx  = schema.size();
+
+        SchemaElement struct_elem;
+        struct_elem.name            = "struct_node";
+        struct_elem.repetition_type = REQUIRED;
+        struct_elem.num_children    = schema_elem.num_children;
+        struct_elem.type            = UNDEFINED_TYPE;
+        struct_elem.converted_type  = UNKNOWN;
+
+        // swap children
+        struct_elem.children_idx = std::move(schema_elem.children_idx);
+        schema_elem.children_idx = {struct_node_idx};
+        schema_elem.num_children = 1;
+
+        struct_elem.max_definition_level = schema_elem.max_definition_level;
+        struct_elem.max_repetition_level = schema_elem.max_repetition_level;
+        schema_elem.max_definition_level--;
+        schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level;
+
+        // change parent index on new node and on children
+        struct_elem.parent_idx = schema_idx;
+        for (auto& child_idx : struct_elem.children_idx) {
+          schema[child_idx].parent_idx = struct_node_idx;
+        }
+        // add our struct (may reallocate `schema`, leaving `schema_elem` dangling)
+        schema.push_back(struct_elem);
+      }
+    }
+
+    for (size_t i = 0; i < schema[schema_idx].children_idx.size(); ++i) {
+      process(schema[schema_idx].children_idx[i]);  // recursion may grow `schema` too
+    }
+  };
+
+  process(0);
+}
+
 metadata::metadata(datasource* source)
 {
   constexpr auto header_len = sizeof(file_header_s);
@@ -195,6 +270,7 @@ metadata::metadata(datasource* source)
   CompactProtocolReader cp(buffer->data(), ender->footer_len);
   CUDF_EXPECTS(cp.read(this), "Cannot parse metadata");
   CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema");
+  sanitize_schema();
 }
 
 std::vector<metadata> aggregate_reader_metadata::metadatas_from_sources(
@@ -445,8 +521,10 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
           child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent);
       }
 
+      auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx));
+
       // if we're at the root, this is a new output column
-      auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))
+      auto const col_type = one_level_list
                               ? type_id::LIST
                               : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id);
       auto const dtype    = to_data_type(col_type, schema_elem);
@@ -485,7 +563,7 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
           input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0});
 
         // set up child output column for one-level encoding list
-        if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) {
+        if (one_level_list) {
           // determine the element data type
           auto const element_type =
             to_type_id(schema_elem, strings_to_categorical, timestamp_type_id);
@@ -506,9 +584,7 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
         std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));
 
         // pop off the extra nesting element.
-        if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) {
-          nesting.pop_back();
-        }
+        if (one_level_list) { nesting.pop_back(); }
 
         path_is_valid = true;  // If we're able to reach leaf then path is valid
       }
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 61e4f94df0f..9ee17f26a10 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -58,6 +58,7 @@ using namespace cudf::io::parquet;
  */
 struct metadata : public FileMetaData {
   explicit metadata(datasource* source);
+  void sanitize_schema();
 };
 
 class aggregate_reader_metadata {
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 81e0e12eeb9..73c946a5feb 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -6732,4 +6732,82 @@ TEST_P(ParquetV2Test, CheckEncodings)
   }
 }
 
+TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
+{
+  constexpr unsigned char repeated_bytes[] = {
+    0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12,
+    0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00,
+    0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a,
+    0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02,
+    0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15,
+    0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15,
+    0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01,
+    0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00,
+    0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15,
+    0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02,
+    0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e,
+    0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d,
+    0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02,
+    0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00,
+    0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d,
+    0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10,
+    0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00,
+    0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10,
+    0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05,
+    0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82,
+    0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c,
+    0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69,
+    0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65,
+    0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04,
+    0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c,
+    0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19,
+    0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64,
+    0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda,
+    0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65,
+    0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75,
+    0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96,
+    0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10,
+    0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05,
+    0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82,
+    0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16,
+    0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76,
+    0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75,
+    0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39,
+    0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38,
+    0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01,
+    0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
+
+  auto read_opts = cudf::io::parquet_reader_options::builder(
+    cudf::io::source_info{reinterpret_cast<char const*>(repeated_bytes), sizeof(repeated_bytes)});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  EXPECT_EQ(result.tbl->view().column(0).size(), 6);
+  EXPECT_EQ(result.tbl->view().num_columns(), 2);
+
+  column_wrapper<int32_t> col0{1, 2, 3, 4, 5, 6};
+  column_wrapper<int64_t> child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}};
+  cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}};
+  auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}};
+
+  auto list_offsets_column =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 1, 2, 5}.release();
+  auto num_list_rows = list_offsets_column->size() - 1;
+
+  auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID);
+  cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), 0, 2, false);
+
+  auto list_col = cudf::make_lists_column(
+    num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, std::move(mask));
+
+  std::vector<std::unique_ptr<cudf::column>> struct_children;
+  struct_children.push_back(std::move(list_col));
+
+  auto outer_struct =
+    cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}};
+  table_view expected{{col0, outer_struct}};
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 96664ec7436033f59aa5b9740e6f54aec707e3cf Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 6 Oct 2023 15:09:11 -0700
Subject: [PATCH 10/12] Add pylibcudf.Scalar that interoperates with Arrow
 scalars (#14133)

This PR adds a new Scalar object to pylibcudf that will function as the pylibcudf equivalent of cudf::scalar. Unlike columns, which are typically operated on in the form of views rather than owning types by libcudf, owning scalars are accepted by (const) ref by libcudf APIs and no corresponding view type exists. Therefore, pylibcudf.Scalar differs from pylibcudf.Column by actually owning an instance of the underlying libcudf type (cudf::scalar). Construction of pylibcudf Scalars is expected to be done from an Arrow scalar.

This PR relies on #14124 and should not be merged until after that one.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14133
---
 python/cudf/cudf/_lib/CMakeLists.txt          |   8 +-
 python/cudf/cudf/_lib/datetime.pyx            |   6 +-
 python/cudf/cudf/_lib/interop.pyx             |  95 +------------
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |   8 ++
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  25 +++-
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   5 +-
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   5 +-
 python/cudf/cudf/_lib/pylibcudf/interop.pxd   |   9 ++
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   |  23 +++
 python/cudf/cudf/_lib/pylibcudf/scalar.pxd    |  32 +++++
 python/cudf/cudf/_lib/pylibcudf/scalar.pyx    | 133 ++++++++++++++++++
 python/cudf/cudf/_lib/pylibcudf/table.pxd     |   3 +
 python/cudf/cudf/_lib/pylibcudf/table.pyx     |  33 ++++-
 python/cudf/cudf/_lib/scalar.pxd              |  13 +-
 python/cudf/cudf/_lib/scalar.pyx              |  88 ++++++++----
 python/cudf/cudf/_lib/strings/CMakeLists.txt  |  10 +-
 .../cudf/_lib/strings/convert/CMakeLists.txt  |  10 +-
 .../cudf/_lib/strings/split/CMakeLists.txt    |  10 +-
 18 files changed, 378 insertions(+), 138 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 947659c290a..1b543b94589 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -107,8 +107,12 @@ if(${PYARROW_RESULT})
   message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
 endif()
 
-set(targets_using_arrow_headers interop avro csv orc json parquet)
-foreach(target IN LISTS targets_using_arrow_headers)
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
   target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
 endforeach()
 
diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index 81949dbaa20..3d96f59c4d6 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -10,6 +10,7 @@ from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.filling cimport calendrical_month_sequence
+from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
@@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset):
         + offset.kwds.get("months", 0)
     )
 
+    cdef const scalar* c_start = start.c_value.get()
     with nogil:
         c_result = move(calendrical_month_sequence(
             n,
-            start.c_value.get()[0],
+            c_start[0],
             months
         ))
     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 639754fc54f..8fd2a409d90 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -4,14 +4,7 @@ from cpython cimport pycapsule
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-from pyarrow.lib cimport (
-    CScalar,
-    CTable,
-    pyarrow_unwrap_scalar,
-    pyarrow_unwrap_table,
-    pyarrow_wrap_scalar,
-    pyarrow_wrap_table,
-)
+from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table
 
 from cudf._lib.cpp.interop cimport (
     DLManagedTensor,
@@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport (
     to_arrow as cpp_to_arrow,
     to_dlpack as cpp_to_dlpack,
 )
-from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport type_id
-from cudf._lib.cpp.wrappers.decimals cimport (
-    decimal32,
-    decimal64,
-    decimal128,
-    scale_type,
-)
-from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf.api.types import is_list_dtype, is_struct_dtype
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
 
 
 def from_dlpack(dlpack_capsule):
@@ -199,79 +182,3 @@ def from_arrow(object input_table):
         c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))
 
     return columns_from_unique_ptr(move(c_result))
-
-
-@acquire_spill_lock()
-def to_arrow_scalar(DeviceScalar source_scalar):
-    """Convert a scalar to a PyArrow scalar.
-
-    Parameters
-    ----------
-    source_scalar : the scalar to convert
-
-    Returns
-    -------
-    pyarrow.lib.Scalar
-    """
-    cdef vector[column_metadata] cpp_metadata = gather_metadata(
-        [("", source_scalar.dtype)]
-    )
-    cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr()
-
-    cdef shared_ptr[CScalar] cpp_arrow_scalar
-    with nogil:
-        cpp_arrow_scalar = cpp_to_arrow(
-            source_scalar_ptr[0], cpp_metadata[0]
-        )
-
-    return pyarrow_wrap_scalar(cpp_arrow_scalar)
-
-
-@acquire_spill_lock()
-def from_arrow_scalar(object input_scalar, output_dtype=None):
-    """Convert from PyArrow scalar to a cudf scalar.
-
-    Parameters
-    ----------
-    input_scalar : PyArrow scalar
-    output_dtype : output type to cast to, ignored except for decimals
-
-    Returns
-    -------
-    cudf._lib.DeviceScalar
-    """
-    cdef shared_ptr[CScalar] cpp_arrow_scalar = (
-        pyarrow_unwrap_scalar(input_scalar)
-    )
-    cdef unique_ptr[scalar] c_result
-
-    with nogil:
-        c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0]))
-
-    cdef type_id ctype = c_result.get().type().id()
-    if ctype == type_id.DECIMAL128:
-        if output_dtype is None:
-            # Decimals must be cast to the cudf dtype of the right width
-            raise ValueError(
-                "Decimal scalars must be constructed with a dtype"
-            )
-
-        if isinstance(output_dtype, Decimal32Dtype):
-            c_result.reset(
-                new fixed_point_scalar[decimal32](
-                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
-                    scale_type(-input_scalar.type.scale),
-                    c_result.get().is_valid()
-                )
-            )
-        elif isinstance(output_dtype, Decimal64Dtype):
-            c_result.reset(
-                new fixed_point_scalar[decimal64](
-                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
-                    scale_type(-input_scalar.type.scale),
-                    c_result.get().is_valid()
-                )
-            )
-        # Decimal128Dtype is a no-op, no conversion needed.
-
-    return DeviceScalar.from_unique_ptr(move(c_result), output_dtype)
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index 515b9c1d6e4..d4e2392ee04 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -22,3 +22,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0ce42dc43ff..5185b2d4bb5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -12,10 +12,33 @@
 # the License.
 # =============================================================================
 
-set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx)
+set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx
+                   types.pyx utils.pyx
+)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
+
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
+  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
+
+# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for
+# more info
+find_package(NumPy REQUIRED)
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
+  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index ba7822b0a54..7a35854392c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -1,9 +1,10 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 # TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
-from . cimport copying
+from . cimport copying, interop
 from .column cimport Column
 from .gpumemoryview cimport gpumemoryview
+from .scalar cimport Scalar
 from .table cimport Table
 # TODO: cimport type_id once
 # https://github.com/cython/cython/issues/5609 is resolved
@@ -12,7 +13,9 @@ from .types cimport DataType
 __all__ = [
     "Column",
     "DataType",
+    "Scalar",
     "Table",
     "copying",
     "gpumemoryview",
+    "interop",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 3edff9a53e8..72b74a57b87 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -1,16 +1,19 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
-from . import copying
+from . import copying, interop
 from .column import Column
 from .gpumemoryview import gpumemoryview
+from .scalar import Scalar
 from .table import Table
 from .types import DataType, TypeId
 
 __all__ = [
     "Column",
     "DataType",
+    "Scalar",
     "Table",
     "TypeId",
     "copying",
     "gpumemoryview",
+    "interop",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd
new file mode 100644
index 00000000000..3a79e5425d4
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    cdef public object name
+    cdef public object children_meta
+    cdef column_metadata to_libcudf(self)
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
new file mode 100644
index 00000000000..0cdca275027
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -0,0 +1,23 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    def __init__(self, name):
+        self.name = name
+        self.children_meta = []
+
+    cdef column_metadata to_libcudf(self):
+        """Convert to C++ column_metadata.
+
+        Since this class is mutable and cheap, it is easier to create the C++
+        object on the fly rather than have it directly backing the storage for
+        the Cython class.
+        """
+        cdef column_metadata c_metadata
+        cdef ColumnMetadata child_meta
+        c_metadata.name = self.name.encode()
+        for child_meta in self.children_meta:
+            c_metadata.children_meta.push_back(child_meta.to_libcudf())
+        return c_metadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
new file mode 100644
index 00000000000..09d853d832f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
@@ -0,0 +1,32 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport DeviceMemoryResource
+
+from cudf._lib.cpp.scalar.scalar cimport scalar
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType
+
+
+cdef class Scalar:
+    cdef unique_ptr[scalar] c_obj
+    cdef DataType _data_type
+
+    # Holds a reference to the DeviceMemoryResource used for allocation.
+    # Ensures the MR does not get destroyed before this Scalar. `mr` is
+    # needed for deallocation
+    cdef DeviceMemoryResource mr
+
+    cdef const scalar* get(self) except *
+
+    cpdef DataType type(self)
+    cpdef bool is_valid(self)
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
new file mode 100644
index 00000000000..04f588bd3e6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
@@ -0,0 +1,133 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cython cimport no_gc_clear
+from cython.operator cimport dereference
+from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.utility cimport move
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport get_current_device_resource
+
+from cudf._lib.cpp.interop cimport (
+    column_metadata,
+    from_arrow as cpp_from_arrow,
+    to_arrow as cpp_to_arrow,
+)
+from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
+from cudf._lib.cpp.wrappers.decimals cimport (
+    decimal32,
+    decimal64,
+    decimal128,
+    scale_type,
+)
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType, type_id
+
+
+# The DeviceMemoryResource attribute could be released prematurely
+# by the gc if the Scalar is in a reference cycle. Removing the tp_clear
+# function with the no_gc_clear decoration prevents that. See
+# https://github.com/rapidsai/rmm/pull/931 for details.
+@no_gc_clear
+cdef class Scalar:
+    """A scalar value in device memory."""
+    # Unlike for columns, libcudf does not support scalar views. All APIs that
+    # accept scalar values accept references to the owning object rather than a
+    # special view type. As a result, pylibcudf.Scalar has a simpler structure
+    # than pylibcudf.Column because it can be a true wrapper around a libcudf
+    # scalar.
+
+    def __cinit__(self, *args, **kwargs):
+        self.mr = get_current_device_resource()
+
+    def __init__(self, pa.Scalar value=None):
+        # TODO: This case is not something we really want to
+        # support, but it is here for now to ease the transition of
+        # DeviceScalar.
+        if value is not None:
+            raise ValueError("Scalar should be constructed with a factory")
+
+    @staticmethod
+    def from_arrow(pa.Scalar value, DataType data_type=None):
+        # Allow passing a dtype, but only for the purpose of decimals for now
+
+        cdef shared_ptr[pa.CScalar] cscalar = (
+            pa.pyarrow_unwrap_scalar(value)
+        )
+        cdef unique_ptr[scalar] c_result
+
+        with nogil:
+            c_result = move(cpp_from_arrow(cscalar.get()[0]))
+
+        cdef Scalar s = Scalar.from_libcudf(move(c_result))
+
+        if s.type().id() != type_id.DECIMAL128:
+            if data_type is not None:
+                raise ValueError(
+                    "dtype may not be passed for non-decimal types"
+                )
+            return s
+
+        if data_type is None:
+            raise ValueError(
+                "Decimal scalars must be constructed with a dtype"
+            )
+
+        cdef type_id tid = data_type.id()
+
+        if tid == type_id.DECIMAL32:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal32](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid == type_id.DECIMAL64:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal64](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid != type_id.DECIMAL128:
+            raise ValueError(
+                "Decimal scalars may only be cast to decimals"
+            )
+
+        return s
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata):
+        cdef shared_ptr[pa.CScalar] c_result
+        cdef column_metadata c_metadata = metadata.to_libcudf()
+
+        with nogil:
+            c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata))
+
+        return pa.pyarrow_wrap_scalar(c_result)
+
+    cdef const scalar* get(self) except *:
+        return self.c_obj.get()
+
+    cpdef DataType type(self):
+        """The type of data in the scalar."""
+        return self._data_type
+
+    cpdef bool is_valid(self):
+        """True if the scalar is valid, false if not"""
+        return self.get().is_valid()
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None):
+        """Construct a Scalar object from a libcudf scalar.
+
+        This method is for pylibcudf's functions to use to ingest outputs of
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        cdef Scalar s = Scalar.__new__(Scalar)
+        s.c_obj.swap(libcudf_scalar)
+        s._data_type = DataType.from_libcudf(s.get().type())
+        return s
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd
index 95f197b13eb..a9e2874232a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
 
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -16,3 +17,5 @@ cdef class Table:
     cdef Table from_libcudf(unique_ptr[table] libcudf_tbl)
 
     cpdef list columns(self)
+
+    cpdef pa.Table to_arrow(self, list metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index 720f9815bd6..c41eb82e4a1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -1,15 +1,22 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
+from pyarrow cimport lib as pa
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.interop cimport (
+    column_metadata,
+    from_arrow as cpp_from_arrow,
+    to_arrow as cpp_to_arrow,
+)
 from cudf._lib.cpp.table.table cimport table
 
 from .column cimport Column
+from .interop cimport ColumnMetadata
 
 
 cdef class Table:
@@ -60,3 +67,27 @@ cdef class Table:
 
     cpdef list columns(self):
         return self._columns
+
+    @staticmethod
+    def from_arrow(pa.Table pyarrow_table):
+        cdef shared_ptr[pa.CTable] ctable = (
+            pa.pyarrow_unwrap_table(pyarrow_table)
+        )
+        cdef unique_ptr[table] c_result
+
+        with nogil:
+            c_result = move(cpp_from_arrow(ctable.get()[0]))
+
+        return Table.from_libcudf(move(c_result))
+
+    cpdef pa.Table to_arrow(self, list metadata):
+        cdef shared_ptr[pa.CTable] c_result
+        cdef vector[column_metadata] c_metadata
+        cdef ColumnMetadata meta
+        for meta in metadata:
+            c_metadata.push_back(meta.to_libcudf())
+
+        with nogil:
+            c_result = move(cpp_to_arrow(self.view(), c_metadata))
+
+        return pa.pyarrow_wrap_table(c_result)
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 1deed60d67d..77733f59c3d 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -1,20 +1,19 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
+# TODO: Would like to remove this cimport, but it will require some more work
+# to excise all C code in scalar.pyx that relies on using the C API of the
+# pylibcudf Scalar underlying the DeviceScalar.
+from cudf._lib cimport pylibcudf
 from cudf._lib.cpp.scalar.scalar cimport scalar
 
 
 cdef class DeviceScalar:
-    cdef unique_ptr[scalar] c_value
-
-    # Holds a reference to the DeviceMemoryResource used for allocation.
-    # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is
-    # needed for deallocation
-    cdef DeviceMemoryResource mr
+    cdef pylibcudf.Scalar c_value
 
     cdef object _dtype
 
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 5ab286c5701..0b64c75f7b6 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-cimport cython
-
 import copy
 
 import numpy as np
@@ -13,17 +11,17 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from rmm._lib.memory_resource cimport get_current_device_resource
-
 import cudf
+from cudf._lib import pylibcudf
 from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
-from cudf.core.dtypes import ListDtype, StructDtype
+from cudf.core.dtypes import (
+    ListDtype,
+    StructDtype,
+    is_list_dtype,
+    is_struct_dtype,
+)
 from cudf.core.missing import NA, NaT
 
-from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
-
-from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar
-
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.scalar.scalar cimport (
     duration_scalar,
@@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
     timestamp_s,
     timestamp_us,
 )
+from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 
 
 def _replace_nested(obj, check, replacement):
@@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement):
                 _replace_nested(v, check, replacement)
 
 
-# The DeviceMemoryResource attribute could be released prematurely
-# by the gc if the DeviceScalar is in a reference cycle. Removing
-# the tp_clear function with the no_gc_clear decoration prevents that.
-# See https://github.com/rapidsai/rmm/pull/931 for details.
-@cython.no_gc_clear
+def gather_metadata(dtypes):
+    """Convert a dict of dtypes to a list of ColumnMetadata objects.
+
+    The metadata is constructed recursively so that nested types are
+    represented as nested ColumnMetadata objects.
+
+    Parameters
+    ----------
+    dtypes : dict
+        A dict mapping column names to dtypes.
+
+    Returns
+    -------
+    List[ColumnMetadata]
+        A list of ColumnMetadata objects.
+    """
+    out = []
+    for name, dtype in dtypes.items():
+        v = pylibcudf.interop.ColumnMetadata(name)
+        if is_struct_dtype(dtype):
+            v.children_meta = gather_metadata(dtype.fields)
+        elif is_list_dtype(dtype):
+            # Offsets column is unnamed and has no children
+            v.children_meta.append(pylibcudf.interop.ColumnMetadata(""))
+            v.children_meta.extend(
+                gather_metadata({"": dtype.element_type})
+            )
+        out.append(v)
+    return out
+
+
 cdef class DeviceScalar:
 
+    # TODO: I think this should be removable, except that currently the way
+    # that from_unique_ptr is implemented is probably dereferencing this in an
+    # invalid state. See what the best way to fix that is.
     def __cinit__(self, *args, **kwargs):
-        self.mr = get_current_device_resource()
+        self.c_value = pylibcudf.Scalar()
 
     def __init__(self, value, dtype):
         """
@@ -85,7 +113,7 @@ cdef class DeviceScalar:
         dtype : dtype
             A NumPy dtype.
         """
-        self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
+        dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
 
         if cudf.utils.utils.is_na_like(value):
             value = None
@@ -108,10 +136,17 @@ cdef class DeviceScalar:
 
         pa_scalar = pa.scalar(value, type=pa_type)
 
-        # Note: This factory-like behavior in __init__ will be removed when
-        # migrating to pylibcudf.
-        cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype)
-        self.c_value.swap(obj.c_value)
+        data_type = None
+        if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
+            tid = pylibcudf.TypeId.DECIMAL128
+            if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
+                tid = pylibcudf.TypeId.DECIMAL32
+            elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
+                tid = pylibcudf.TypeId.DECIMAL64
+            data_type = pylibcudf.DataType(tid, -dtype.scale)
+
+        self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type)
+        self._dtype = dtype
 
     def _to_host_scalar(self):
         is_datetime = self.dtype.kind == "M"
@@ -119,7 +154,8 @@ cdef class DeviceScalar:
 
         null_type = NaT if is_datetime or is_timedelta else NA
 
-        ps = to_arrow_scalar(self)
+        metadata = gather_metadata({"": self.dtype})[0]
+        ps = self.c_value.to_arrow(metadata)
         if not ps.is_valid:
             return null_type
 
@@ -158,13 +194,13 @@ cdef class DeviceScalar:
         return self._to_host_scalar()
 
     cdef const scalar* get_raw_ptr(self) except *:
-        return self.c_value.get()
+        return self.c_value.c_obj.get()
 
     cpdef bool is_valid(self):
         """
         Returns if the Scalar is valid or not(i.e., <NA>).
         """
-        return self.get_raw_ptr()[0].is_valid()
+        return self.c_value.is_valid()
 
     def __repr__(self):
         if cudf.utils.utils.is_na_like(self.value):
@@ -183,7 +219,7 @@ cdef class DeviceScalar:
         cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar)
         cdef libcudf_types.data_type cdtype
 
-        s.c_value = move(ptr)
+        s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr))
         cdtype = s.get_raw_ptr()[0].type()
 
         if dtype is not None:
@@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype):
     if dtype.char in 'mM':
         nat = dtype.type('NaT').astype(dtype)
         if dtype.type == np.datetime64:
-            _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True)
+            _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True)
         elif dtype.type == np.timedelta64:
-            _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True)
+            _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True)
         return result
     else:
         raise TypeError('NAT only valid for datetime and timedelta')
diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt
index a5e87a456cb..fc11f047ab4 100644
--- a/python/cudf/cudf/_lib/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -40,6 +40,14 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
 
 add_subdirectory(convert)
 add_subdirectory(split)
diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
index 434f79d3b5f..f55bb1fb780 100644
--- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -22,3 +22,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
index 59a22c06e85..2f2063482af 100644
--- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,3 +20,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()

From e28017cc17d2feb050d2effd4ebafb84600fd607 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Mon, 9 Oct 2023 10:05:12 -0500
Subject: [PATCH 11/12] Cleanup of namespaces in parquet code. (#14259)

Cleans up several issues in the parquet code:

- We were using the namespace `cudf::io::detail::parquet`, when `cudf::io::parquet::detail` makes more sense.
- Converts the `cudf::io::parquet::gpu` namespace to also just use `cudf::io::parquet::detail`
- Several detail-style headers and source files were using `cudf::io::parquet` when they should probably have been in the detail namespace.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/14259
---
 cpp/include/cudf/io/detail/parquet.hpp        |   8 +-
 cpp/include/cudf/io/parquet.hpp               |   4 +-
 cpp/src/io/functions.cpp                      |   4 +-
 cpp/src/io/parquet/chunk_dict.cu              |  19 +-
 .../io/parquet/compact_protocol_reader.cpp    |   8 +-
 .../io/parquet/compact_protocol_reader.hpp    |   9 +-
 .../io/parquet/compact_protocol_writer.cpp    |   8 +-
 .../io/parquet/compact_protocol_writer.hpp    |   8 +-
 cpp/src/io/parquet/decode_preprocess.cu       |  10 +-
 cpp/src/io/parquet/delta_binary.cuh           |   4 +-
 cpp/src/io/parquet/page_data.cu               |  12 +-
 cpp/src/io/parquet/page_decode.cuh            |   4 +-
 cpp/src/io/parquet/page_delta_decode.cu       |   6 +-
 cpp/src/io/parquet/page_enc.cu                |  22 +-
 cpp/src/io/parquet/page_hdr.cu                |  14 +-
 cpp/src/io/parquet/page_string_decode.cu      |  14 +-
 cpp/src/io/parquet/page_string_utils.cuh      |   4 +-
 cpp/src/io/parquet/parquet.hpp                |   9 +-
 cpp/src/io/parquet/parquet_common.hpp         |   9 +-
 cpp/src/io/parquet/parquet_gpu.cuh            |   4 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  27 +-
 cpp/src/io/parquet/predicate_pushdown.cpp     |  14 +-
 cpp/src/io/parquet/reader.cpp                 |   4 +-
 cpp/src/io/parquet/reader_impl.cpp            |  36 +--
 cpp/src/io/parquet/reader_impl.hpp            |  12 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    | 121 ++++----
 cpp/src/io/parquet/reader_impl_helpers.hpp    |  21 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu  | 259 +++++++++---------
 cpp/src/io/parquet/rle_stream.cuh             |   4 +-
 cpp/src/io/parquet/writer_impl.cu             | 223 ++++++++-------
 cpp/src/io/parquet/writer_impl.hpp            |  28 +-
 cpp/tests/io/parquet_test.cpp                 | 207 +++++++-------
 32 files changed, 531 insertions(+), 605 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 074f690d2c7..0b8ee9676de 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -38,7 +38,7 @@ class parquet_reader_options;
 class parquet_writer_options;
 class chunked_parquet_writer_options;
 
-namespace detail::parquet {
+namespace parquet::detail {
 
 /**
  * @brief Class to read Parquet dataset data into columns.
@@ -186,7 +186,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   parquet_writer_options const& options,
-                  single_write_mode mode,
+                  cudf::io::detail::single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
@@ -201,7 +201,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   chunked_parquet_writer_options const& options,
-                  single_write_mode mode,
+                  cudf::io::detail::single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
@@ -250,5 +250,5 @@ class writer {
  * metadata.
  */
 parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);
-}  // namespace detail::parquet
+}  // namespace parquet::detail
 }  // namespace cudf::io
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index deaf23d405a..6283099e700 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -499,7 +499,7 @@ class chunked_parquet_reader {
   [[nodiscard]] table_with_metadata read_chunk() const;
 
  private:
-  std::unique_ptr<cudf::io::detail::parquet::chunked_reader> reader;
+  std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
 };
 
 /** @} */  // end of group
@@ -1750,7 +1750,7 @@ class parquet_chunked_writer {
     std::vector<std::string> const& column_chunks_file_paths = {});
 
   /// Unique pointer to impl writer class
-  std::unique_ptr<cudf::io::detail::parquet::writer> writer;
+  std::unique_ptr<parquet::detail::writer> writer;
 };
 
 /** @} */  // end of group
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 392a7850886..726442d752e 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -470,8 +470,8 @@ void orc_chunked_writer::close()
   writer->close();
 }
 
-using namespace cudf::io::detail::parquet;
-namespace detail_parquet = cudf::io::detail::parquet;
+using namespace cudf::io::parquet::detail;
+namespace detail_parquet = cudf::io::parquet::detail;
 
 table_with_metadata read_parquet(parquet_reader_options const& options,
                                  rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 9ff1869edde..53ff31ab0a7 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -24,10 +24,8 @@
 
 #include <cuda/atomic>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
+
 namespace {
 constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
@@ -101,7 +99,7 @@ struct map_find_fn {
 
 template <int block_size>
 __global__ void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<gpu::PageFragment const> frags)
+  populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
   auto block_x = blockIdx.x;
@@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size)
 
 template <int block_size>
 __global__ void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(cudf::detail::device_2dspan<gpu::PageFragment const> frags)
+  get_dictionary_indices_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto col_idx = blockIdx.y;
   auto block_x = blockIdx.x;
@@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_st
     <<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
 }
 
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<gpu::PageFragment const> frags,
+void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
@@ -290,14 +288,11 @@ void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_vi
   collect_map_entries_kernel<block_size><<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
 }
 
-void get_dictionary_indices(cudf::detail::device_2dspan<gpu::PageFragment const> frags,
+void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   get_dictionary_indices_kernel<DEFAULT_BLOCK_SIZE>
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(frags);
 }
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 5c7b8ca3f8c..81d1be64a45 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -21,9 +21,7 @@
 #include <functional>
 #include <tuple>
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
 
 /**
  * @brief Base class for parquet field functors.
@@ -870,6 +868,4 @@ int CompactProtocolReader::WalkSchema(
   }
 }
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 619815db503..cbb4161b138 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -25,9 +25,8 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
+
 /**
  * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata
  *
@@ -147,6 +146,4 @@ class CompactProtocolReader {
   friend class parquet_field_struct_blob;
 };
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 60bc8984d81..9adc8767880 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -16,9 +16,7 @@
 
 #include "compact_protocol_writer.hpp"
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
 
 /**
  * @brief Parquet CompactProtocolWriter class
@@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field)
   current_field_value = field;
 }
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index 26d66527aa5..4849a814b14 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -25,9 +25,7 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
 
 /**
  * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata
@@ -115,6 +113,4 @@ class CompactProtocolFieldWriter {
   inline void set_current_field(int const& field);
 };
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index 8de3702bc2e..544c93ee616 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -23,10 +23,7 @@
 #include <rmm/exec_policy.hpp>
 #include <thrust/reduce.h>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
@@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector<PageInfo>& pages,
   }
 }
 
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh
index 2382e4aafdf..a513e6674b4 100644
--- a/cpp/src/io/parquet/delta_binary.cuh
+++ b/cpp/src/io/parquet/delta_binary.cuh
@@ -18,7 +18,7 @@
 
 #include "page_decode.cuh"
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 // DELTA_XXX encoding support
 //
@@ -291,4 +291,4 @@ struct delta_binary_decoder {
   }
 };
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 230834632dd..cce3659b902 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -23,10 +23,7 @@
 #include <rmm/exec_policy.hpp>
 #include <thrust/reduce.h>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
@@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector<PageInfo>
 }
 
 /**
- * @copydoc cudf::io::parquet::gpu::DecodePageData
+ * @copydoc cudf::io::parquet::detail::DecodePageData
  */
 void __host__ DecodePageData(cudf::detail::hostdevice_vector<PageInfo>& pages,
                              cudf::detail::hostdevice_vector<ColumnChunkDesc> const& chunks,
@@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector<PageInfo>& pages,
   }
 }
 
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index d70cabdd35f..7c866fd8b9e 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -24,7 +24,7 @@
 #include <cuda/atomic>
 #include <cuda/std/tuple>
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 struct page_state_s {
   constexpr page_state_s() noexcept {}
@@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
   return true;
 }
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index 2b78dead205..d25684a59f3 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -23,7 +23,7 @@
 #include <rmm/exec_policy.hpp>
 #include <thrust/transform_scan.h>
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
@@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96)
 }  // anonymous namespace
 
 /**
- * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary
+ * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary
  */
 void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector<PageInfo>& pages,
                                 cudf::detail::hostdevice_vector<ColumnChunkDesc> const& chunks,
@@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector<PageInfo>& pages
   }
 }
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index fe0dbb85124..78873d5e8ca 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -41,10 +41,7 @@
 #include <thrust/scatter.h>
 #include <thrust/tuple.h>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
@@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128)
 // blockDim {128,1,1}
 __global__ void __launch_bounds__(128)
   gpuInitPages(device_2dspan<EncColumnChunk> chunks,
-               device_span<gpu::EncPage> pages,
+               device_span<EncPage> pages,
                device_span<size_type> page_sizes,
                device_span<size_type> comp_page_sizes,
                device_span<parquet_column_device_view const> col_desc,
@@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v)
 // blockDim(128, 1, 1)
 template <int block_size>
 __global__ void __launch_bounds__(128, 8)
-  gpuEncodePages(device_span<gpu::EncPage> pages,
+  gpuEncodePages(device_span<EncPage> pages,
                  device_span<device_span<uint8_t const>> comp_in,
                  device_span<device_span<uint8_t>> comp_out,
                  device_span<compression_result> comp_results,
@@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128)
 
 // blockDim(1024, 1, 1)
 __global__ void __launch_bounds__(1024)
-  gpuGatherPages(device_span<EncColumnChunk> chunks, device_span<gpu::EncPage const> pages)
+  gpuGatherPages(device_span<EncColumnChunk> chunks, device_span<EncPage const> pages)
 {
   __shared__ __align__(8) EncColumnChunk ck_g;
   __shared__ __align__(8) EncPage page_g;
@@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span<statistics_group> groups,
 }
 
 void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
-                      device_span<gpu::EncPage> pages,
+                      device_span<EncPage> pages,
                       device_span<size_type> page_sizes,
                       device_span<size_type> comp_page_sizes,
                       device_span<parquet_column_device_view const> col_desc,
@@ -2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
                                                      write_v2_headers);
 }
 
-void EncodePages(device_span<gpu::EncPage> pages,
+void EncodePages(device_span<EncPage> pages,
                  bool write_v2_headers,
                  device_span<device_span<uint8_t const>> comp_in,
                  device_span<device_span<uint8_t>> comp_out,
@@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span<EncPage> pages,
 }
 
 void GatherPages(device_span<EncColumnChunk> chunks,
-                 device_span<gpu::EncPage const> pages,
+                 device_span<EncPage const> pages,
                  rmm::cuda_stream_view stream)
 {
   gpuGatherPages<<<chunks.size(), 1024, 0, stream.value()>>>(chunks, pages);
@@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span<EncColumnChunk> chunks,
     chunks, column_stats, column_index_truncate_length);
 }
 
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 6f8b2f50443..eae8e05e61e 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -20,10 +20,8 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
+
 // Minimal thrift implementation for parsing page headers
 // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 
@@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type)
  * @param chunk Column chunk the page belongs to
  * @return `kernel_mask_bits` value for the given page
  */
-__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page,
-                                         gpu::ColumnChunkDesc const& chunk)
+__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk)
 {
   if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; }
 
@@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks,
   gpuBuildStringDictionaryIndex<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_chunks);
 }
 
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index d79abe4a6d2..4d79770ec34 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -20,10 +20,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/gather.cuh>
 
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
@@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size)
 }  // anonymous namespace
 
 /**
- * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes
+ * @copydoc cudf::io::parquet::detail::ComputePageStringSizes
  */
 void ComputePageStringSizes(cudf::detail::hostdevice_vector<PageInfo>& pages,
                             cudf::detail::hostdevice_vector<ColumnChunkDesc> const& chunks,
@@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector<PageInfo>& pages,
 }
 
 /**
- * @copydoc cudf::io::parquet::gpu::DecodeStringPageData
+ * @copydoc cudf::io::parquet::detail::DecodeStringPageData
  */
 void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector<PageInfo>& pages,
                                    cudf::detail::hostdevice_vector<ColumnChunkDesc> const& chunks,
@@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector<PageInfo>& pa
   }
 }
 
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh
index 9395599b3ff..a81d0a64466 100644
--- a/cpp/src/io/parquet/page_string_utils.cuh
+++ b/cpp/src/io/parquet/page_string_utils.cuh
@@ -18,7 +18,7 @@
 
 #include <cudf/strings/detail/gather.cuh>
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp.
 // copies from src to dst in 16B chunks per thread.
@@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi
   }
 }
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 1df49262e87..c5993d73dec 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -25,9 +25,8 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
+
 constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24));
 
 /**
@@ -405,6 +404,4 @@ static inline int CountLeadingZeros32(uint32_t value)
 #endif
 }
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp
index 5a1716bb547..50736197eb9 100644
--- a/cpp/src/io/parquet/parquet_common.hpp
+++ b/cpp/src/io/parquet/parquet_common.hpp
@@ -18,9 +18,8 @@
 
 #include <cstdint>
 
-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
+
 // Max decimal precisions according to the parquet spec:
 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
 auto constexpr MAX_DECIMAL32_PRECISION  = 9;
@@ -156,6 +155,4 @@ enum FieldType {
   ST_FLD_STRUCT = 12,
 };
 
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index dc74bee1536..10e12ebb782 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -23,7 +23,7 @@
 
 #include <cuco/static_map.cuh>
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 auto constexpr KEY_SENTINEL   = size_type{-1};
 auto constexpr VALUE_SENTINEL = size_type{-1};
@@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx,
   return idx;
 }
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 51c862b376b..767668cc65e 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -35,7 +35,7 @@
 
 #include <vector>
 
-namespace cudf::io::parquet {
+namespace cudf::io::parquet::detail {
 
 using cudf::io::detail::string_index_pair;
 
@@ -88,8 +88,6 @@ struct input_column_info {
   auto nesting_depth() const { return nesting.size(); }
 };
 
-namespace gpu {
-
 /**
  * @brief Enums for the flags in the page header
  */
@@ -347,7 +345,7 @@ struct file_intermediate_data {
 
   // all chunks from the selected row groups. We may end up reading these chunks progressively
   // instead of all at once
-  std::vector<gpu::ColumnChunkDesc> chunks{};
+  std::vector<ColumnChunkDesc> chunks{};
 
   // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we
   // may not be visiting every row group that contains these bounds
@@ -372,16 +370,16 @@ struct pass_intermediate_data {
 
   // rowgroup, chunk and page information for the current pass.
   std::vector<row_group_info> row_groups{};
-  cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc> chunks{};
-  cudf::detail::hostdevice_vector<gpu::PageInfo> pages_info{};
-  cudf::detail::hostdevice_vector<gpu::PageNestingInfo> page_nesting_info{};
-  cudf::detail::hostdevice_vector<gpu::PageNestingDecodeInfo> page_nesting_decode_info{};
+  cudf::detail::hostdevice_vector<ColumnChunkDesc> chunks{};
+  cudf::detail::hostdevice_vector<PageInfo> pages_info{};
+  cudf::detail::hostdevice_vector<PageNestingInfo> page_nesting_info{};
+  cudf::detail::hostdevice_vector<PageNestingDecodeInfo> page_nesting_decode_info{};
 
   rmm::device_uvector<int32_t> page_keys{0, rmm::cuda_stream_default};
   rmm::device_uvector<int32_t> page_index{0, rmm::cuda_stream_default};
   rmm::device_uvector<string_index_pair> str_dict_index{0, rmm::cuda_stream_default};
 
-  std::vector<gpu::chunk_read_info> output_chunk_read_info;
+  std::vector<chunk_read_info> output_chunk_read_info;
   std::size_t current_output_chunk{0};
 
   rmm::device_buffer level_decode_data{};
@@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_st
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<gpu::PageFragment const> frags,
+void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
 /**
@@ -762,7 +760,7 @@ void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_vi
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(cudf::detail::device_2dspan<gpu::PageFragment const> frags,
+void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
 /**
@@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan<gpu::PageFragment const>
  * @param[in] stream CUDA stream to use
  */
 void InitEncoderPages(cudf::detail::device_2dspan<EncColumnChunk> chunks,
-                      device_span<gpu::EncPage> pages,
+                      device_span<EncPage> pages,
                       device_span<size_type> page_sizes,
                       device_span<size_type> comp_page_sizes,
                       device_span<parquet_column_device_view const> col_desc,
@@ -847,7 +845,7 @@ void EncodePageHeaders(device_span<EncPage> pages,
  * @param[in] stream CUDA stream to use
  */
 void GatherPages(device_span<EncColumnChunk> chunks,
-                 device_span<gpu::EncPage const> pages,
+                 device_span<EncPage const> pages,
                  rmm::cuda_stream_view stream);
 
 /**
@@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span<EncColumnChunk> chunks,
                          int32_t column_index_truncate_length,
                          rmm::cuda_stream_view stream);
 
-}  // namespace gpu
-}  // namespace cudf::io::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 805d082c71e..9083be1c2dd 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -35,7 +35,7 @@
 #include <numeric>
 #include <optional>
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
 
 namespace {
 /**
@@ -62,13 +62,13 @@ struct stats_caster {
 
   // uses storage type as T
   template <typename T, CUDF_ENABLE_IF(cudf::is_dictionary<T>() or cudf::is_nested<T>())>
-  static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type)
+  static T convert(uint8_t const* stats_val, size_t stats_size, Type const type)
   {
     CUDF_FAIL("unsupported type for stats casting");
   }
 
   template <typename T, CUDF_ENABLE_IF(cudf::is_boolean<T>())>
-  static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type)
+  static T convert(uint8_t const* stats_val, size_t stats_size, Type const type)
   {
     CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination");
     return targetType<T>(*reinterpret_cast<bool const*>(stats_val));
@@ -78,7 +78,7 @@ struct stats_caster {
   template <typename T,
             CUDF_ENABLE_IF((cudf::is_integral<T>() and !cudf::is_boolean<T>()) or
                            cudf::is_fixed_point<T>() or cudf::is_chrono<T>())>
-  static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type)
+  static T convert(uint8_t const* stats_val, size_t stats_size, Type const type)
   {
     switch (type) {
       case INT32: return targetType<T>(*reinterpret_cast<int32_t const*>(stats_val));
@@ -103,7 +103,7 @@ struct stats_caster {
   }
 
   template <typename T, CUDF_ENABLE_IF(cudf::is_floating_point<T>())>
-  static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type)
+  static T convert(uint8_t const* stats_val, size_t stats_size, Type const type)
   {
     switch (type) {
       case FLOAT: return targetType<T>(*reinterpret_cast<float const*>(stats_val));
@@ -113,7 +113,7 @@ struct stats_caster {
   }
 
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, string_view>)>
-  static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type)
+  static T convert(uint8_t const* stats_val, size_t stats_size, Type const type)
   {
     switch (type) {
       case BYTE_ARRAY: [[fallthrough]];
@@ -527,4 +527,4 @@ named_to_reference_converter::visit_operands(
   return transformed_operands;
 }
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index 1e87447006d..17d7c07bc91 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -16,7 +16,7 @@
 
 #include "reader_impl.hpp"
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
 
 reader::reader() = default;
 
@@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); }
 
 table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); }
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index ea40f29a070..26ec83d5946 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -25,7 +25,7 @@
 #include <bitset>
 #include <numeric>
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
 
 void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
 {
@@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   CUDF_EXPECTS(pages.size() > 0, "There is no page to decode");
 
   size_t const sum_max_depths = std::accumulate(
-    chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) {
+    chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) {
       return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema);
     });
 
@@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   // doing a gather operation later on.
   // TODO: This step is somewhat redundant if size info has already been calculated (nested schema,
   // chunked reader).
-  auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0;
+  auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0;
   std::vector<size_t> col_sizes(_input_columns.size(), 0L);
   if (has_strings) {
-    gpu::ComputePageStringSizes(
+    ComputePageStringSizes(
       pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream);
 
     col_sizes = calculate_page_string_offsets();
@@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   if (has_strings) {
     auto& stream = streams[s_idx++];
     chunk_nested_str_data.host_to_device_async(stream);
-    gpu::DecodeStringPageData(
+    DecodeStringPageData(
       pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream);
   }
 
   // launch delta binary decoder
-  if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) {
-    gpu::DecodeDeltaBinary(
+  if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) {
+    DecodeDeltaBinary(
       pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]);
   }
 
   // launch the catch-all page decoder
-  if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) {
-    gpu::DecodePageData(
+  if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) {
+    DecodePageData(
       pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]);
   }
 
@@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
 
   // update null counts in the final column buffers
   for (size_t idx = 0; idx < pages.size(); idx++) {
-    gpu::PageInfo* pi = &pages[idx];
-    if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; }
-    gpu::ColumnChunkDesc* col          = &chunks[pi->chunk_idx];
+    PageInfo* pi = &pages[idx];
+    if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; }
+    ColumnChunkDesc* col               = &chunks[pi->chunk_idx];
     input_column_info const& input_col = _input_columns[col->src_col_index];
 
-    int index                        = pi->nesting_decode - page_nesting_decode.device_ptr();
-    gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index];
+    int index                   = pi->nesting_decode - page_nesting_decode.device_ptr();
+    PageNestingDecodeInfo* pndi = &page_nesting_decode[index];
 
     auto* cols = &_output_buffers;
     for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) {
@@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit,
 
   // Save the states of the output buffers for reuse in `chunk_read()`.
   for (auto const& buff : _output_buffers) {
-    _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff));
+    _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff));
   }
 }
 
@@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows,
 
     // always create the pass struct, even if we end up with no passes.
     // this will also cause the previous pass information to be deleted
-    _pass_itm_data = std::make_unique<cudf::io::parquet::gpu::pass_intermediate_data>();
+    _pass_itm_data = std::make_unique<pass_intermediate_data>();
 
     if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() &&
         not _input_columns.empty() && _current_input_pass < num_passes) {
@@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk()
   if (_chunk_count > 0) {
     _output_buffers.resize(0);
     for (auto const& buff : _output_buffers_template) {
-      _output_buffers.emplace_back(inline_column_buffer::empty_like(buff));
+      _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff));
     }
   }
 
@@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> con
                           metadata.get_key_value_metadata()[0]};
 }
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 9445e4d1648..6003b931b04 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -35,7 +35,7 @@
 #include <optional>
 #include <vector>
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
 
 /**
  * @brief Implementation for Parquet reader
@@ -261,10 +261,10 @@ class reader::impl {
   std::vector<input_column_info> _input_columns;
 
   // Buffers for generating output columns
-  std::vector<inline_column_buffer> _output_buffers;
+  std::vector<cudf::io::detail::inline_column_buffer> _output_buffers;
 
   // Buffers copied from `_output_buffers` after construction for reuse
-  std::vector<inline_column_buffer> _output_buffers_template;
+  std::vector<cudf::io::detail::inline_column_buffer> _output_buffers_template;
 
   // _output_buffers associated schema indices
   std::vector<int> _output_column_schemas;
@@ -285,8 +285,8 @@ class reader::impl {
   // Within a pass, we produce one or more chunks of output, whose maximum total
   // byte size is controlled by _output_chunk_read_limit.
 
-  cudf::io::parquet::gpu::file_intermediate_data _file_itm_data;
-  std::unique_ptr<cudf::io::parquet::gpu::pass_intermediate_data> _pass_itm_data;
+  file_intermediate_data _file_itm_data;
+  std::unique_ptr<pass_intermediate_data> _pass_itm_data;
 
   // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents
   // the start/end of the chunks to be loaded for a given pass.
@@ -301,4 +301,4 @@ class reader::impl {
   bool _file_preprocessed{false};
 };
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 9778cfc47d2..171cf07da3e 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -21,34 +21,34 @@
 #include <numeric>
 #include <regex>
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
 
 namespace {
 
 ConvertedType logical_type_to_converted_type(LogicalType const& logical)
 {
   if (logical.isset.STRING) {
-    return parquet::UTF8;
+    return UTF8;
   } else if (logical.isset.MAP) {
-    return parquet::MAP;
+    return MAP;
   } else if (logical.isset.LIST) {
-    return parquet::LIST;
+    return LIST;
   } else if (logical.isset.ENUM) {
-    return parquet::ENUM;
+    return ENUM;
   } else if (logical.isset.DECIMAL) {
-    return parquet::DECIMAL;  // TODO set decimal values
+    return DECIMAL;  // TODO set decimal values
   } else if (logical.isset.DATE) {
-    return parquet::DATE;
+    return DATE;
   } else if (logical.isset.TIME) {
     if (logical.TIME.unit.isset.MILLIS)
-      return parquet::TIME_MILLIS;
+      return TIME_MILLIS;
     else if (logical.TIME.unit.isset.MICROS)
-      return parquet::TIME_MICROS;
+      return TIME_MICROS;
   } else if (logical.isset.TIMESTAMP) {
     if (logical.TIMESTAMP.unit.isset.MILLIS)
-      return parquet::TIMESTAMP_MILLIS;
+      return TIMESTAMP_MILLIS;
     else if (logical.TIMESTAMP.unit.isset.MICROS)
-      return parquet::TIMESTAMP_MICROS;
+      return TIMESTAMP_MICROS;
   } else if (logical.isset.INTEGER) {
     switch (logical.INTEGER.bitWidth) {
       case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8;
@@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical)
       default: break;
     }
   } else if (logical.isset.UNKNOWN) {
-    return parquet::NA;
+    return NA;
   } else if (logical.isset.JSON) {
-    return parquet::JSON;
+    return JSON;
   } else if (logical.isset.BSON) {
-    return parquet::BSON;
+    return BSON;
   }
-  return parquet::UNKNOWN;
+  return UNKNOWN;
 }
 
 }  // namespace
@@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema,
                    bool strings_to_categorical,
                    type_id timestamp_type_id)
 {
-  parquet::Type const physical            = schema.type;
-  parquet::LogicalType const logical_type = schema.logical_type;
-  parquet::ConvertedType converted_type   = schema.converted_type;
-  int32_t decimal_precision               = schema.decimal_precision;
+  Type const physical            = schema.type;
+  LogicalType const logical_type = schema.logical_type;
+  ConvertedType converted_type   = schema.converted_type;
+  int32_t decimal_precision      = schema.decimal_precision;
 
   // Logical type used for actual data interpretation; the legacy converted type
   // is superseded by 'logical' type whenever available.
   auto const inferred_converted_type = logical_type_to_converted_type(logical_type);
-  if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; }
-  if (inferred_converted_type == parquet::DECIMAL) {
+  if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; }
+  if (inferred_converted_type == DECIMAL) {
     decimal_precision = schema.logical_type.DECIMAL.precision;
   }
 
   switch (converted_type) {
-    case parquet::UINT_8: return type_id::UINT8;
-    case parquet::INT_8: return type_id::INT8;
-    case parquet::UINT_16: return type_id::UINT16;
-    case parquet::INT_16: return type_id::INT16;
-    case parquet::UINT_32: return type_id::UINT32;
-    case parquet::UINT_64: return type_id::UINT64;
-    case parquet::DATE: return type_id::TIMESTAMP_DAYS;
-    case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS;
-    case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS;
-    case parquet::TIMESTAMP_MILLIS:
+    case UINT_8: return type_id::UINT8;
+    case INT_8: return type_id::INT8;
+    case UINT_16: return type_id::UINT16;
+    case INT_16: return type_id::INT16;
+    case UINT_32: return type_id::UINT32;
+    case UINT_64: return type_id::UINT64;
+    case DATE: return type_id::TIMESTAMP_DAYS;
+    case TIME_MILLIS: return type_id::DURATION_MILLISECONDS;
+    case TIME_MICROS: return type_id::DURATION_MICROSECONDS;
+    case TIMESTAMP_MILLIS:
       return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
                                                    : type_id::TIMESTAMP_MILLISECONDS;
-    case parquet::TIMESTAMP_MICROS:
+    case TIMESTAMP_MICROS:
       return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
                                                    : type_id::TIMESTAMP_MICROSECONDS;
-    case parquet::DECIMAL:
-      if (physical == parquet::INT32) { return type_id::DECIMAL32; }
-      if (physical == parquet::INT64) { return type_id::DECIMAL64; }
-      if (physical == parquet::FIXED_LEN_BYTE_ARRAY) {
+    case DECIMAL:
+      if (physical == INT32) { return type_id::DECIMAL32; }
+      if (physical == INT64) { return type_id::DECIMAL64; }
+      if (physical == FIXED_LEN_BYTE_ARRAY) {
         if (schema.type_length <= static_cast<int32_t>(sizeof(int32_t))) {
           return type_id::DECIMAL32;
         }
@@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema,
           return type_id::DECIMAL128;
         }
       }
-      if (physical == parquet::BYTE_ARRAY) {
+      if (physical == BYTE_ARRAY) {
         CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision");
         if (decimal_precision <= MAX_DECIMAL32_PRECISION) {
           return type_id::DECIMAL32;
@@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema,
       break;
 
     // maps are just List<Struct<>>.
-    case parquet::MAP:
-    case parquet::LIST: return type_id::LIST;
-    case parquet::NA: return type_id::STRING;
+    case MAP:
+    case LIST: return type_id::LIST;
+    case NA: return type_id::STRING;
     // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support
     default: break;
   }
 
-  if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and
+  if (inferred_converted_type == UNKNOWN and physical == INT64 and
       logical_type.TIMESTAMP.unit.isset.NANOS) {
     return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
                                                  : type_id::TIMESTAMP_NANOSECONDS;
   }
 
-  if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and
+  if (inferred_converted_type == UNKNOWN and physical == INT64 and
       logical_type.TIME.unit.isset.NANOS) {
     return type_id::DURATION_NANOSECONDS;
   }
@@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema,
   // Physical storage type supported by Parquet; controls the on-disk storage
   // format in combination with the encoding type.
   switch (physical) {
-    case parquet::BOOLEAN: return type_id::BOOL8;
-    case parquet::INT32: return type_id::INT32;
-    case parquet::INT64: return type_id::INT64;
-    case parquet::FLOAT: return type_id::FLOAT32;
-    case parquet::DOUBLE: return type_id::FLOAT64;
-    case parquet::BYTE_ARRAY:
-    case parquet::FIXED_LEN_BYTE_ARRAY:
+    case BOOLEAN: return type_id::BOOL8;
+    case INT32: return type_id::INT32;
+    case INT64: return type_id::INT64;
+    case FLOAT: return type_id::FLOAT32;
+    case DOUBLE: return type_id::FLOAT64;
+    case BYTE_ARRAY:
+    case FIXED_LEN_BYTE_ARRAY:
       // Can be mapped to INT32 (32-bit hash) or STRING
       return strings_to_categorical ? type_id::INT32 : type_id::STRING;
-    case parquet::INT96:
+    case INT96:
       return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
                                                    : type_id::TIMESTAMP_NANOSECONDS;
     default: break;
@@ -420,7 +420,7 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con
   return names;
 }
 
-std::tuple<int64_t, size_type, std::vector<gpu::row_group_info>>
+std::tuple<int64_t, size_type, std::vector<row_group_info>>
 aggregate_reader_metadata::select_row_groups(
   host_span<std::vector<size_type> const> row_group_indices,
   int64_t skip_rows_opt,
@@ -438,7 +438,7 @@ aggregate_reader_metadata::select_row_groups(
         host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
     }
   }
-  std::vector<gpu::row_group_info> selection;
+  std::vector<row_group_info> selection;
   auto [rows_to_skip, rows_to_read] = [&]() {
     if (not row_group_indices.empty()) { return std::pair<int64_t, size_type>{}; }
     auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options(
@@ -478,7 +478,7 @@ aggregate_reader_metadata::select_row_groups(
 }
 
 std::tuple<std::vector<input_column_info>,
-           std::vector<inline_column_buffer>,
+           std::vector<cudf::io::detail::inline_column_buffer>,
            std::vector<size_type>>
 aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>> const& use_names,
                                           bool include_index,
@@ -496,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
              : -1;
   };
 
-  std::vector<inline_column_buffer> output_columns;
+  std::vector<cudf::io::detail::inline_column_buffer> output_columns;
   std::vector<input_column_info> input_columns;
   std::vector<int> nesting;
 
   // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is
   // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is
   // not a child of "struct1" then the function will return false for "struct1"
-  std::function<bool(column_name_info const*, int, std::vector<inline_column_buffer>&, bool)>
+  std::function<bool(
+    column_name_info const*, int, std::vector<cudf::io::detail::inline_column_buffer>&, bool)>
     build_column = [&](column_name_info const* col_name_info,
                        int schema_idx,
-                       std::vector<inline_column_buffer>& out_col_array,
+                       std::vector<cudf::io::detail::inline_column_buffer>& out_col_array,
                        bool has_list_parent) {
       if (schema_idx < 0) { return false; }
       auto const& schema_elem = get_schema(schema_idx);
@@ -529,7 +530,8 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
                               : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id);
       auto const dtype    = to_data_type(col_type, schema_elem);
 
-      inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL);
+      cudf::io::detail::inline_column_buffer output_col(dtype,
+                                                        schema_elem.repetition_type == OPTIONAL);
       if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; }
       // store the index of this element if inserted in out_col_array
       nesting.push_back(static_cast<int>(out_col_array.size()));
@@ -569,7 +571,8 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
             to_type_id(schema_elem, strings_to_categorical, timestamp_type_id);
           auto const element_dtype = to_data_type(element_type, schema_elem);
 
-          inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL);
+          cudf::io::detail::inline_column_buffer element_col(
+            element_dtype, schema_elem.repetition_type == OPTIONAL);
           if (has_list_parent || col_type == type_id::LIST) {
             element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT;
           }
@@ -732,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
     std::move(input_columns), std::move(output_columns), std::move(output_column_schemas));
 }
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 9ee17f26a10..1a73e2f55ac 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -32,9 +32,7 @@
 #include <tuple>
 #include <vector>
 
-namespace cudf::io::detail::parquet {
-
-using namespace cudf::io::parquet;
+namespace cudf::io::parquet::detail {
 
 /**
  * @brief Function that translates Parquet datatype to cuDF type enum
@@ -182,7 +180,7 @@ class aggregate_reader_metadata {
    * @return A tuple of corrected row_start, row_count and list of row group indexes and its
    *         starting row
    */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<gpu::row_group_info>> select_row_groups(
+  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>> select_row_groups(
     host_span<std::vector<size_type> const> row_group_indices,
     int64_t row_start,
     std::optional<size_type> const& row_count,
@@ -202,12 +200,13 @@ class aggregate_reader_metadata {
    * @return input column information, output column information, list of output column schema
    * indices
    */
-  [[nodiscard]] std::
-    tuple<std::vector<input_column_info>, std::vector<inline_column_buffer>, std::vector<size_type>>
-    select_columns(std::optional<std::vector<std::string>> const& use_names,
-                   bool include_index,
-                   bool strings_to_categorical,
-                   type_id timestamp_type_id) const;
+  [[nodiscard]] std::tuple<std::vector<input_column_info>,
+                           std::vector<cudf::io::detail::inline_column_buffer>,
+                           std::vector<size_type>>
+  select_columns(std::optional<std::vector<std::string>> const& use_names,
+                 bool include_index,
+                 bool strings_to_categorical,
+                 type_id timestamp_type_id) const;
 };
 
 /**
@@ -276,4 +275,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer
   std::list<ast::operation> _operators;
 };
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index c731c467f2c..4bc6bb6f43b 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -43,7 +43,8 @@
 
 #include <numeric>
 
-namespace cudf::io::detail::parquet {
+namespace cudf::io::parquet::detail {
+
 namespace {
 
 /**
@@ -185,11 +186,11 @@ template <typename T = uint8_t>
  */
 [[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
                                                                    type_id timestamp_type_id,
-                                                                   parquet::Type physical,
+                                                                   Type physical,
                                                                    int8_t converted,
                                                                    int32_t length)
 {
-  int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0;
+  int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0;
   int32_t clock_rate = 0;
   if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) {
     type_width = 1;  // I32 -> I8
@@ -202,9 +203,9 @@ template <typename T = uint8_t>
   }
 
   int8_t converted_type = converted;
-  if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 &&
+  if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 &&
       not cudf::is_fixed_point(data_type{column_type_id})) {
-    converted_type = parquet::UNKNOWN;  // Not converting to float64 or decimal
+    converted_type = UNKNOWN;  // Not converting to float64 or decimal
   }
   return std::make_tuple(type_width, clock_rate, converted_type);
 }
@@ -226,7 +227,7 @@ template <typename T = uint8_t>
 [[nodiscard]] std::future<void> read_column_chunks_async(
   std::vector<std::unique_ptr<datasource>> const& sources,
   std::vector<std::unique_ptr<datasource::buffer>>& page_data,
-  cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
+  cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
   size_t begin_chunk,
   size_t end_chunk,
   std::vector<size_t> const& column_chunk_offsets,
@@ -239,11 +240,10 @@ template <typename T = uint8_t>
     size_t const io_offset   = column_chunk_offsets[chunk];
     size_t io_size           = chunks[chunk].compressed_size;
     size_t next_chunk        = chunk + 1;
-    bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED);
+    bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED);
     while (next_chunk < end_chunk) {
-      size_t const next_offset = column_chunk_offsets[next_chunk];
-      bool const is_next_compressed =
-        (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED);
+      size_t const next_offset      = column_chunk_offsets[next_chunk];
+      bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED);
       if (next_offset != io_offset + io_size || is_next_compressed != is_compressed ||
           chunk_source_map[chunk] != chunk_source_map[next_chunk]) {
         // Can't merge if not contiguous or mixing compressed and uncompressed
@@ -300,13 +300,13 @@ template <typename T = uint8_t>
  *
  * @return The total number of pages
  */
-[[nodiscard]] size_t count_page_headers(
-  cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& chunks, rmm::cuda_stream_view stream)
+[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
+                                        rmm::cuda_stream_view stream)
 {
   size_t total_pages = 0;
 
   chunks.host_to_device_async(stream);
-  gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
   chunks.device_to_host_sync(stream);
 
   for (size_t c = 0; c < chunks.size(); c++) {
@@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc)
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @returns The size in bytes of level type data required
  */
-int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
-                        cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
+int decode_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
+                        cudf::detail::hostdevice_vector<PageInfo>& pages,
                         rmm::cuda_stream_view stream)
 {
   // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages),
@@ -350,14 +350,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
   }
 
   chunks.host_to_device_async(stream);
-  gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
 
   // compute max bytes needed for level data
   auto level_bit_size =
     cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) {
       auto c = chunks[i];
       return static_cast<int>(
-        max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION]));
+        max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION]));
     });
   // max level data bit size.
   int const max_level_bits   = thrust::reduce(rmm::exec_policy(stream),
@@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
  * @return Device buffer to decompressed page data
  */
 [[nodiscard]] rmm::device_buffer decompress_page_data(
-  cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
-  cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
+  cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
+  cudf::detail::hostdevice_vector<PageInfo>& pages,
   rmm::cuda_stream_view stream)
 {
-  auto for_each_codec_page = [&](parquet::Compression codec, std::function<void(size_t)> const& f) {
+  auto for_each_codec_page = [&](Compression codec, std::function<void(size_t)> const& f) {
     for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
       const auto page_stride = chunks[c].max_num_pages;
       if (chunks[c].codec == codec) {
@@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
   size_t total_decomp_size = 0;
 
   struct codec_stats {
-    parquet::Compression compression_type = UNCOMPRESSED;
-    size_t num_pages                      = 0;
-    int32_t max_decompressed_size         = 0;
-    size_t total_decomp_size              = 0;
+    Compression compression_type  = UNCOMPRESSED;
+    size_t num_pages              = 0;
+    int32_t max_decompressed_size = 0;
+    size_t total_decomp_size      = 0;
   };
 
-  std::array codecs{codec_stats{parquet::GZIP},
-                    codec_stats{parquet::SNAPPY},
-                    codec_stats{parquet::BROTLI},
-                    codec_stats{parquet::ZSTD}};
+  std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}};
 
   auto is_codec_supported = [&codecs](int8_t codec) {
-    if (codec == parquet::UNCOMPRESSED) return true;
+    if (codec == UNCOMPRESSED) return true;
     return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) {
              return codec == cstats.compression_type;
            }) != codecs.end();
@@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
       codec.num_pages++;
       num_comp_pages++;
     });
-    if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) {
+    if (codec.compression_type == BROTLI && codec.num_pages > 0) {
       debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream);
     }
   }
@@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
       auto& page          = pages[page_idx];
       // offset will only be non-zero for V2 pages
       auto const offset =
-        page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION];
+        page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION];
       // for V2 need to copy def and rep level info into place, and then offset the
       // input and output buffers. otherwise we'd have to keep both the compressed
       // and decompressed data.
@@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
     device_span<compression_result> d_comp_res_view(comp_res.data() + start_pos, codec.num_pages);
 
     switch (codec.compression_type) {
-      case parquet::GZIP:
+      case GZIP:
         gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream);
         break;
-      case parquet::SNAPPY:
-        if (nvcomp_integration::is_stable_enabled()) {
+      case SNAPPY:
+        if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) {
           nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
                                      d_comp_in,
                                      d_comp_out,
@@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
           gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream);
         }
         break;
-      case parquet::ZSTD:
+      case ZSTD:
         nvcomp::batched_decompress(nvcomp::compression_type::ZSTD,
                                    d_comp_in,
                                    d_comp_out,
@@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
                                    codec.total_decomp_size,
                                    stream);
         break;
-      case parquet::BROTLI:
+      case BROTLI:
         gpu_debrotli(d_comp_in,
                      d_comp_out,
                      d_comp_res_view,
@@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info()
     });
 
   page_nesting_info =
-    cudf::detail::hostdevice_vector<gpu::PageNestingInfo>{total_page_nesting_infos, _stream};
+    cudf::detail::hostdevice_vector<PageNestingInfo>{total_page_nesting_infos, _stream};
   page_nesting_decode_info =
-    cudf::detail::hostdevice_vector<gpu::PageNestingDecodeInfo>{total_page_nesting_infos, _stream};
+    cudf::detail::hostdevice_vector<PageNestingDecodeInfo>{total_page_nesting_infos, _stream};
 
   // update pointers in the PageInfos
   int target_page_index = 0;
@@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info()
       if (!cur_schema.is_stub()) {
         // initialize each page within the chunk
         for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) {
-          gpu::PageNestingInfo* pni =
+          PageNestingInfo* pni =
             &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)];
 
-          gpu::PageNestingDecodeInfo* nesting_info =
+          PageNestingDecodeInfo* nesting_info =
             &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)];
 
           // if we have lists, set our start and end depth remappings
@@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space()
   for (size_t idx = 0; idx < pages.size(); idx++) {
     auto& p = pages[idx];
 
-    p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf;
+    p.lvl_decode_buf[level_type::DEFINITION] = buf;
     buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size);
-    p.lvl_decode_buf[gpu::level_type::REPETITION] = buf;
+    p.lvl_decode_buf[level_type::REPETITION] = buf;
     buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size);
   }
 }
@@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info()
                         schema.converted_type,
                         schema.type_length);
 
-      chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size,
-                                            nullptr,
-                                            col_meta.num_values,
-                                            schema.type,
-                                            type_width,
-                                            row_group_start,
-                                            row_group_rows,
-                                            schema.max_definition_level,
-                                            schema.max_repetition_level,
-                                            _metadata->get_output_nesting_depth(col.schema_idx),
-                                            required_bits(schema.max_definition_level),
-                                            required_bits(schema.max_repetition_level),
-                                            col_meta.codec,
-                                            converted_type,
-                                            schema.logical_type,
-                                            schema.decimal_precision,
-                                            clock_rate,
-                                            i,
-                                            col.schema_idx));
+      chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size,
+                                       nullptr,
+                                       col_meta.num_values,
+                                       schema.type,
+                                       type_width,
+                                       row_group_start,
+                                       row_group_rows,
+                                       schema.max_definition_level,
+                                       schema.max_repetition_level,
+                                       _metadata->get_output_nesting_depth(col.schema_idx),
+                                       required_bits(schema.max_definition_level),
+                                       required_bits(schema.max_repetition_level),
+                                       col_meta.codec,
+                                       converted_type,
+                                       schema.logical_type,
+                                       schema.decimal_precision,
+                                       clock_rate,
+                                       i,
+                                       col.schema_idx));
     }
 
     remaining_rows -= row_group_rows;
@@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info()
 void reader::impl::setup_pass()
 {
   // this will also cause the previous pass information to be deleted
-  _pass_itm_data = std::make_unique<cudf::io::parquet::gpu::pass_intermediate_data>();
+  _pass_itm_data = std::make_unique<pass_intermediate_data>();
 
   // setup row groups to be loaded for this pass
   auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass];
@@ -929,8 +926,7 @@ void reader::impl::setup_pass()
   auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup);
   auto chunk_end   = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup);
 
-  _pass_itm_data->chunks =
-    cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>(num_chunks, _stream);
+  _pass_itm_data->chunks = cudf::detail::hostdevice_vector<ColumnChunkDesc>(num_chunks, _stream);
   std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin());
 
   // adjust skip_rows and num_rows by what's available in the row groups we are processing
@@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data()
   // Process dataset chunk pages into output columns
   auto const total_pages = count_page_headers(chunks, _stream);
   if (total_pages <= 0) { return; }
-  pages = cudf::detail::hostdevice_vector<gpu::PageInfo>(total_pages, total_pages, _stream);
+  pages = cudf::detail::hostdevice_vector<PageInfo>(total_pages, total_pages, _stream);
 
   // decoding of column/page information
   _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream);
@@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data()
     decomp_page_data = decompress_page_data(chunks, pages, _stream);
     // Free compressed data
     for (size_t c = 0; c < chunks.size(); c++) {
-      if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); }
+      if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); }
     }
   }
 
@@ -1019,14 +1015,13 @@ struct cumulative_row_info {
 };
 
 #if defined(PREPROCESS_DEBUG)
-void print_pages(cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
-                 rmm::cuda_stream_view _stream)
+void print_pages(cudf::detail::hostdevice_vector<PageInfo>& pages, rmm::cuda_stream_view _stream)
 {
   pages.device_to_host_sync(_stream);
   for (size_t idx = 0; idx < pages.size(); idx++) {
     auto const& p = pages[idx];
     // skip dictionary pages
-    if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; }
+    if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; }
     printf(
       "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), "
       "str_bytes(%d)\n",
@@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
   }
 }
 
-void print_cumulative_page_info(cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
+void print_cumulative_page_info(cudf::detail::hostdevice_vector<PageInfo>& pages,
                                 rmm::device_uvector<int32_t> const& page_index,
                                 rmm::device_uvector<cumulative_row_info> const& c_info,
                                 rmm::cuda_stream_view stream)
@@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector<gpu::PageInfo>&
     printf("Schema %d\n", schemas[idx]);
     for (size_t pidx = 0; pidx < pages.size(); pidx++) {
       auto const& page = pages[h_page_index[pidx]];
-      if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) {
+      if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) {
         continue;
       }
       printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes);
@@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector<gpu::PageInfo>&
   }
 }
 
-void print_cumulative_row_info(
-  host_span<cumulative_row_info const> sizes,
-  std::string const& label,
-  std::optional<std::vector<gpu::chunk_read_info>> splits = std::nullopt)
+void print_cumulative_row_info(host_span<cumulative_row_info const> sizes,
+                               std::string const& label,
+                               std::optional<std::vector<chunk_read_info>> splits = std::nullopt)
 {
   if (splits.has_value()) {
     printf("------------\nSplits\n");
@@ -1093,7 +1087,7 @@ void print_cumulative_row_info(
     if (splits.has_value()) {
       // if we have a split at this row count and this is the last instance of this row count
       auto start = thrust::make_transform_iterator(
-        splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; });
+        splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; });
       auto end               = start + splits->size();
       auto split             = std::find(start, end, sizes[idx].row_count);
       auto const split_index = [&]() -> int {
@@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()<string_view>(size_t num_rows, boo
  * Sums across all nesting levels.
  */
 struct get_cumulative_row_info {
-  gpu::PageInfo const* const pages;
+  PageInfo const* const pages;
 
   __device__ cumulative_row_info operator()(size_type index)
   {
     auto const& page = pages[index];
-    if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) {
+    if (page.flags & PAGEINFO_FLAGS_DICTIONARY) {
       return cumulative_row_info{0, 0, page.src_col_schema};
     }
 
@@ -1250,15 +1244,15 @@ struct row_total_size {
  * @param num_rows Total number of rows to read
  * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns
  */
-std::vector<gpu::chunk_read_info> find_splits(std::vector<cumulative_row_info> const& sizes,
-                                              size_t num_rows,
-                                              size_t chunk_read_limit)
+std::vector<chunk_read_info> find_splits(std::vector<cumulative_row_info> const& sizes,
+                                         size_t num_rows,
+                                         size_t chunk_read_limit)
 {
   // now we have an array of {row_count, real output bytes}. just walk through it and generate
   // splits.
   // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch
   // sizes are reasonably large, this shouldn't iterate too many times
-  std::vector<gpu::chunk_read_info> splits;
+  std::vector<chunk_read_info> splits;
   {
     size_t cur_pos             = 0;
     size_t cur_cumulative_size = 0;
@@ -1290,7 +1284,7 @@ std::vector<gpu::chunk_read_info> find_splits(std::vector<cumulative_row_info> c
 
       auto const start_row = cur_row_count;
       cur_row_count        = sizes[split_pos].row_count;
-      splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row});
+      splits.push_back(chunk_read_info{start_row, cur_row_count - start_row});
       cur_pos             = split_pos;
       cur_cumulative_size = sizes[split_pos].size_bytes;
     }
@@ -1311,12 +1305,11 @@ std::vector<gpu::chunk_read_info> find_splits(std::vector<cumulative_row_info> c
  * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns
  * @param stream CUDA stream to use
  */
-std::vector<gpu::chunk_read_info> compute_splits(
-  cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
-  gpu::pass_intermediate_data const& id,
-  size_t num_rows,
-  size_t chunk_read_limit,
-  rmm::cuda_stream_view stream)
+std::vector<chunk_read_info> compute_splits(cudf::detail::hostdevice_vector<PageInfo>& pages,
+                                            pass_intermediate_data const& id,
+                                            size_t num_rows,
+                                            size_t chunk_read_limit,
+                                            rmm::cuda_stream_view stream)
 {
   auto const& page_keys  = id.page_keys;
   auto const& page_index = id.page_index;
@@ -1395,16 +1388,16 @@ std::vector<gpu::chunk_read_info> compute_splits(
 }
 
 struct get_page_chunk_idx {
-  __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; }
+  __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; }
 };
 
 struct get_page_num_rows {
-  __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; }
+  __device__ size_type operator()(PageInfo const& page) { return page.num_rows; }
 };
 
 struct get_page_column_index {
-  gpu::ColumnChunkDesc const* chunks;
-  __device__ size_type operator()(gpu::PageInfo const& page)
+  ColumnChunkDesc const* chunks;
+  __device__ size_type operator()(PageInfo const& page)
   {
     return chunks[page.chunk_idx].src_col_index;
   }
@@ -1441,7 +1434,7 @@ struct get_page_nesting_size {
   input_col_info const* const input_cols;
   size_type const max_depth;
   size_t const num_pages;
-  gpu::PageInfo const* const pages;
+  PageInfo const* const pages;
   int const* page_indices;
 
   __device__ size_type operator()(size_t index) const
@@ -1450,7 +1443,7 @@ struct get_page_nesting_size {
 
     auto const& page = pages[page_indices[indices.page_idx]];
     if (page.src_col_schema != input_cols[indices.col_idx].schema_idx ||
-        page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY ||
+        page.flags & PAGEINFO_FLAGS_DICTIONARY ||
         indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) {
       return 0;
     }
@@ -1468,7 +1461,7 @@ struct get_reduction_key {
  * @brief Writes to the chunk_row field of the PageInfo struct.
  */
 struct chunk_row_output_iter {
-  gpu::PageInfo* p;
+  PageInfo* p;
   using value_type        = size_type;
   using difference_type   = size_type;
   using pointer           = size_type*;
@@ -1490,7 +1483,7 @@ struct chunk_row_output_iter {
  * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema.
  */
 struct start_offset_output_iterator {
-  gpu::PageInfo const* pages;
+  PageInfo const* pages;
   int const* page_indices;
   size_t cur_index;
   input_col_info const* input_cols;
@@ -1529,9 +1522,9 @@ struct start_offset_output_iterator {
   {
     auto const indices = reduction_indices{index, max_depth, num_pages};
 
-    gpu::PageInfo const& p = pages[page_indices[indices.page_idx]];
+    PageInfo const& p = pages[page_indices[indices.page_idx]];
     if (p.src_col_schema != input_cols[indices.col_idx].schema_idx ||
-        p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY ||
+        p.flags & PAGEINFO_FLAGS_DICTIONARY ||
         indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) {
       return empty;
     }
@@ -1540,15 +1533,15 @@ struct start_offset_output_iterator {
 };
 
 struct flat_column_num_rows {
-  gpu::PageInfo const* pages;
-  gpu::ColumnChunkDesc const* chunks;
+  PageInfo const* pages;
+  ColumnChunkDesc const* chunks;
 
   __device__ size_type operator()(size_type pindex) const
   {
-    gpu::PageInfo const& page = pages[pindex];
+    PageInfo const& page = pages[pindex];
     // ignore dictionary pages and pages belonging to any column containing repetition (lists)
-    if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) ||
-        (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) {
+    if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) ||
+        (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) {
       return 0;
     }
     return page.num_rows;
@@ -1581,8 +1574,8 @@ struct row_counts_different {
  * @param expected_row_count Expected row count, if applicable
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void detect_malformed_pages(cudf::detail::hostdevice_vector<gpu::PageInfo>& pages,
-                            cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc> const& chunks,
+void detect_malformed_pages(cudf::detail::hostdevice_vector<PageInfo>& pages,
+                            cudf::detail::hostdevice_vector<ColumnChunkDesc> const& chunks,
                             device_span<int const> page_keys,
                             device_span<int const> page_index,
                             std::optional<size_t> expected_row_count,
@@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector<gpu::PageInfo>& page
 }
 
 struct page_to_string_size {
-  gpu::PageInfo* pages;
-  gpu::ColumnChunkDesc const* chunks;
+  PageInfo* pages;
+  ColumnChunkDesc const* chunks;
 
   __device__ size_t operator()(size_type page_idx) const
   {
     auto const page  = pages[page_idx];
     auto const chunk = chunks[page.chunk_idx];
 
-    if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) {
-      return 0;
-    }
+    if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; }
     return pages[page_idx].str_bytes;
   }
 };
 
 struct page_offset_output_iter {
-  gpu::PageInfo* p;
+  PageInfo* p;
   size_type const* index;
 
   using value_type        = size_type;
@@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re
       cols          = &out_buf.children;
 
       // if this has a list parent, we have to get column sizes from the
-      // data computed during gpu::ComputePageSizes
+      // data computed during ComputePageSizes
       if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) {
         has_lists = true;
         break;
@@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re
 
   // generate string dict indices if necessary
   {
-    auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) {
+    auto is_dict_chunk = [](ColumnChunkDesc const& chunk) {
       return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0;
     };
 
@@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re
 
     if (total_str_dict_indexes > 0) {
       chunks.host_to_device_async(_stream);
-      gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream);
+      BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream);
     }
   }
 
@@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re
     // if:
     // - user has passed custom row bounds
     // - we will be doing a chunked read
-    gpu::ComputePageSizes(pages,
-                          chunks,
-                          0,  // 0-max size_t. process all possible rows
-                          std::numeric_limits<size_t>::max(),
-                          true,                  // compute num_rows
-                          chunk_read_limit > 0,  // compute string sizes
-                          _pass_itm_data->level_type_size,
-                          _stream);
+    ComputePageSizes(pages,
+                     chunks,
+                     0,  // 0-max size_t. process all possible rows
+                     std::numeric_limits<size_t>::max(),
+                     true,                  // compute num_rows
+                     chunk_read_limit > 0,  // compute string sizes
+                     _pass_itm_data->level_type_size,
+                     _stream);
 
     // computes:
     // PageInfo::chunk_row (the absolute start row index) for all pages
@@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re
   _pass_itm_data->output_chunk_read_info =
     _output_chunk_read_limit > 0
       ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream)
-      : std::vector<gpu::chunk_read_info>{{skip_rows, num_rows}};
+      : std::vector<chunk_read_info>{{skip_rows, num_rows}};
 }
 
 void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds)
@@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
   // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds
   // is set (if the user has specified artificial bounds).
   if (uses_custom_row_bounds) {
-    gpu::ComputePageSizes(pages,
-                          chunks,
-                          skip_rows,
-                          num_rows,
-                          false,  // num_rows is already computed
-                          false,  // no need to compute string sizes
-                          _pass_itm_data->level_type_size,
-                          _stream);
+    ComputePageSizes(pages,
+                     chunks,
+                     skip_rows,
+                     num_rows,
+                     false,  // num_rows is already computed
+                     false,  // no need to compute string sizes
+                     _pass_itm_data->level_type_size,
+                     _stream);
 
     // print_pages(pages, _stream);
   }
@@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
       cols          = &out_buf.children;
 
       // if this has a list parent, we have to get column sizes from the
-      // data computed during gpu::ComputePageSizes
+      // data computed during ComputePageSizes
       if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) {
         has_lists = true;
       }
@@ -2014,4 +2005,4 @@ std::vector<size_t> reader::impl::calculate_page_string_offsets()
   return col_sizes;
 }
 
-}  // namespace cudf::io::detail::parquet
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh
index 2545a074a38..799d6d9fd64 100644
--- a/cpp/src/io/parquet/rle_stream.cuh
+++ b/cpp/src/io/parquet/rle_stream.cuh
@@ -20,7 +20,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 
-namespace cudf::io::parquet::gpu {
+namespace cudf::io::parquet::detail {
 
 template <int num_threads>
 constexpr int rle_stream_required_run_buffer_size()
@@ -362,4 +362,4 @@ struct rle_stream {
   }
 };
 
-}  // namespace cudf::io::parquet::gpu
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index a124f352ee4..50589f23626 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -54,12 +54,9 @@
 #include <numeric>
 #include <utility>
 
-namespace cudf {
-namespace io {
-namespace detail {
-namespace parquet {
-using namespace cudf::io::parquet;
-using namespace cudf::io;
+namespace cudf::io::parquet::detail {
+
+using namespace cudf::io::detail;
 
 struct aggregate_writer_metadata {
   aggregate_writer_metadata(host_span<partition_info const> partitions,
@@ -185,13 +182,13 @@ namespace {
  * @param compression The compression type
  * @return The supported Parquet compression
  */
-parquet::Compression to_parquet_compression(compression_type compression)
+Compression to_parquet_compression(compression_type compression)
 {
   switch (compression) {
     case compression_type::AUTO:
-    case compression_type::SNAPPY: return parquet::Compression::SNAPPY;
-    case compression_type::ZSTD: return parquet::Compression::ZSTD;
-    case compression_type::NONE: return parquet::Compression::UNCOMPRESSED;
+    case compression_type::SNAPPY: return Compression::SNAPPY;
+    case compression_type::ZSTD: return Compression::ZSTD;
+    case compression_type::NONE: return Compression::UNCOMPRESSED;
     default: CUDF_FAIL("Unsupported compression type");
   }
 }
@@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
 {
   for (uint8_t enc = 0; enc < static_cast<uint8_t>(Encoding::NUM_ENCODINGS); enc++) {
     auto const enc_enum = static_cast<Encoding>(enc);
-    if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); }
+    if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); }
   }
 }
 
@@ -761,11 +758,11 @@ struct parquet_column_view {
                       std::vector<schema_tree_node> const& schema_tree,
                       rmm::cuda_stream_view stream);
 
-  [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const;
+  [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const;
 
   [[nodiscard]] column_view cudf_column_view() const { return cudf_col; }
-  [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; }
-  [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; }
+  [[nodiscard]] Type physical_type() const { return schema_node.type; }
+  [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; }
 
   std::vector<std::string> const& get_path_in_schema() { return path_in_schema; }
 
@@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
   uint16_t max_rep_level = 0;
   curr_schema_node       = schema_node;
   while (curr_schema_node.parent_idx != -1) {
-    if (curr_schema_node.repetition_type == parquet::REPEATED or
-        curr_schema_node.repetition_type == parquet::OPTIONAL) {
+    if (curr_schema_node.repetition_type == REPEATED or
+        curr_schema_node.repetition_type == OPTIONAL) {
       ++max_def_level;
     }
-    if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; }
+    if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; }
     curr_schema_node = schema_tree[curr_schema_node.parent_idx];
   }
   CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported");
@@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
   }
 }
 
-gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const
+parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const
 {
-  auto desc        = gpu::parquet_column_device_view{};  // Zero out all fields
+  auto desc        = parquet_column_device_view{};  // Zero out all fields
   desc.stats_dtype = schema_node.stats_dtype;
   desc.ts_scale    = schema_node.ts_scale;
 
@@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s
  * @param fragment_size Number of rows per fragment
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void init_row_group_fragments(cudf::detail::hostdevice_2dvector<gpu::PageFragment>& frag,
-                              device_span<gpu::parquet_column_device_view const> col_desc,
+void init_row_group_fragments(cudf::detail::hostdevice_2dvector<PageFragment>& frag,
+                              device_span<parquet_column_device_view const> col_desc,
                               host_span<partition_info const> partitions,
                               device_span<int const> part_frag_offset,
                               uint32_t fragment_size,
@@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector<gpu::PageFragmen
 {
   auto d_partitions = cudf::detail::make_device_uvector_async(
     partitions, stream, rmm::mr::get_current_device_resource());
-  gpu::InitRowGroupFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream);
+  InitRowGroupFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream);
   frag.device_to_host_sync(stream);
 }
 
@@ -954,13 +951,13 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector<gpu::PageFragmen
  * @param frag_sizes Array of fragment sizes for each column
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void calculate_page_fragments(device_span<gpu::PageFragment> frag,
+void calculate_page_fragments(device_span<PageFragment> frag,
                               host_span<size_type const> frag_sizes,
                               rmm::cuda_stream_view stream)
 {
   auto d_frag_sz = cudf::detail::make_device_uvector_async(
     frag_sizes, stream, rmm::mr::get_current_device_resource());
-  gpu::CalculatePageFragments(frag, d_frag_sz, stream);
+  CalculatePageFragments(frag, d_frag_sz, stream);
 }
 
 /**
@@ -972,13 +969,13 @@ void calculate_page_fragments(device_span<gpu::PageFragment> frag,
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 void gather_fragment_statistics(device_span<statistics_chunk> frag_stats,
-                                device_span<gpu::PageFragment const> frags,
+                                device_span<PageFragment const> frags,
                                 bool int96_timestamps,
                                 rmm::cuda_stream_view stream)
 {
   rmm::device_uvector<statistics_group> frag_stats_group(frag_stats.size(), stream);
 
-  gpu::InitFragmentStatistics(frag_stats_group, frags, stream);
+  InitFragmentStatistics(frag_stats_group, frags, stream);
   detail::calculate_group_statistics<detail::io_file_format::PARQUET>(
     frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps);
   stream.synchronize();
@@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block
   return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize);
 }
 
-auto init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-                     device_span<gpu::parquet_column_device_view const> col_desc,
+auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
+                     device_span<parquet_column_device_view const> col_desc,
                      uint32_t num_columns,
                      size_t max_page_size_bytes,
                      size_type max_page_size_rows,
@@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
 
   chunks.host_to_device_async(stream);
   // Calculate number of pages and store in respective chunks
-  gpu::InitEncoderPages(chunks,
-                        {},
-                        {},
-                        {},
-                        col_desc,
-                        num_columns,
-                        max_page_size_bytes,
-                        max_page_size_rows,
-                        page_alignment(compression_codec),
-                        write_v2_headers,
-                        nullptr,
-                        nullptr,
-                        stream);
+  InitEncoderPages(chunks,
+                   {},
+                   {},
+                   {},
+                   col_desc,
+                   num_columns,
+                   max_page_size_bytes,
+                   max_page_size_rows,
+                   page_alignment(compression_codec),
+                   write_v2_headers,
+                   nullptr,
+                   nullptr,
+                   stream);
   chunks.device_to_host_sync(stream);
 
   int num_pages = 0;
@@ -1046,19 +1043,19 @@ auto init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
   // Now that we know the number of pages, allocate an array to hold per page size and get it
   // populated
   cudf::detail::hostdevice_vector<size_type> page_sizes(num_pages, stream);
-  gpu::InitEncoderPages(chunks,
-                        {},
-                        page_sizes,
-                        {},
-                        col_desc,
-                        num_columns,
-                        max_page_size_bytes,
-                        max_page_size_rows,
-                        page_alignment(compression_codec),
-                        write_v2_headers,
-                        nullptr,
-                        nullptr,
-                        stream);
+  InitEncoderPages(chunks,
+                   {},
+                   page_sizes,
+                   {},
+                   col_desc,
+                   num_columns,
+                   max_page_size_bytes,
+                   max_page_size_rows,
+                   page_alignment(compression_codec),
+                   write_v2_headers,
+                   nullptr,
+                   nullptr,
+                   stream);
   page_sizes.device_to_host_sync(stream);
 
   // Get per-page max compressed size
@@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
   comp_page_sizes.host_to_device_async(stream);
 
   // Use per-page max compressed size to calculate chunk.compressed_size
-  gpu::InitEncoderPages(chunks,
-                        {},
-                        {},
-                        comp_page_sizes,
-                        col_desc,
-                        num_columns,
-                        max_page_size_bytes,
-                        max_page_size_rows,
-                        page_alignment(compression_codec),
-                        write_v2_headers,
-                        nullptr,
-                        nullptr,
-                        stream);
+  InitEncoderPages(chunks,
+                   {},
+                   {},
+                   comp_page_sizes,
+                   col_desc,
+                   num_columns,
+                   max_page_size_bytes,
+                   max_page_size_rows,
+                   page_alignment(compression_codec),
+                   write_v2_headers,
+                   nullptr,
+                   nullptr,
+                   stream);
   chunks.device_to_host_sync(stream);
   return comp_page_sizes;
 }
 
 size_t max_page_bytes(Compression compression, size_t max_page_size_bytes)
 {
-  if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; }
+  if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; }
 
   auto const ncomp_type   = to_nvcomp_compression_type(compression);
   auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type)
@@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes)
 }
 
 std::pair<std::vector<rmm::device_uvector<size_type>>, std::vector<rmm::device_uvector<size_type>>>
-build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-                         host_span<gpu::parquet_column_device_view const> col_desc,
-                         device_2dspan<gpu::PageFragment const> frags,
+build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
+                         host_span<parquet_column_device_view const> col_desc,
+                         device_2dspan<PageFragment const> frags,
                          Compression compression,
                          dictionary_policy dict_policy,
                          size_t max_dict_size,
@@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
   }
 
   // Allocate slots for each chunk
-  std::vector<rmm::device_uvector<gpu::slot_type>> hash_maps_storage;
+  std::vector<rmm::device_uvector<slot_type>> hash_maps_storage;
   hash_maps_storage.reserve(h_chunks.size());
   for (auto& chunk : h_chunks) {
     if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN ||
@@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
 
   chunks.host_to_device_async(stream);
 
-  gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream);
-  gpu::populate_chunk_hash_maps(frags, stream);
+  initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream);
+  populate_chunk_hash_maps(frags, stream);
 
   chunks.device_to_host_sync(stream);
 
@@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
     chunk.dict_index          = inserted_dict_index.data();
   }
   chunks.host_to_device_async(stream);
-  gpu::collect_map_entries(chunks.device_view().flat_view(), stream);
-  gpu::get_dictionary_indices(frags, stream);
+  collect_map_entries(chunks.device_view().flat_view(), stream);
+  get_dictionary_indices(frags, stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }
@@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
  * @param write_v2_headers True if version 2 page headers are to be written
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void init_encoder_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-                        device_span<gpu::parquet_column_device_view const> col_desc,
-                        device_span<gpu::EncPage> pages,
+void init_encoder_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
+                        device_span<parquet_column_device_view const> col_desc,
+                        device_span<EncPage> pages,
                         cudf::detail::hostdevice_vector<size_type>& comp_page_sizes,
                         statistics_chunk* page_stats,
                         statistics_chunk* frag_stats,
@@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
  * @param write_v2_headers True if V2 page headers should be written
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
-void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-                  device_span<gpu::EncPage> pages,
+void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
+                  device_span<EncPage> pages,
                   uint32_t pages_in_batch,
                   uint32_t first_page_in_batch,
                   uint32_t rowgroups_in_batch,
@@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
       ? device_span<statistics_chunk const>(page_stats + first_page_in_batch, pages_in_batch)
       : device_span<statistics_chunk const>();
 
-  uint32_t max_comp_pages =
-    (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0;
+  uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0;
 
   rmm::device_uvector<device_span<uint8_t const>> comp_in(max_comp_pages, stream);
   rmm::device_uvector<device_span<uint8_t>> comp_out(max_comp_pages, stream);
@@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
                comp_res.end(),
                compression_result{0, compression_status::FAILURE});
 
-  gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream);
+  EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream);
   switch (compression) {
-    case parquet::Compression::SNAPPY:
+    case Compression::SNAPPY:
       if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
         gpu_snap(comp_in, comp_out, comp_res, stream);
       } else {
@@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
           nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream);
       }
       break;
-    case parquet::Compression::ZSTD: {
+    case Compression::ZSTD: {
       if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD);
           reason) {
         CUDF_FAIL("Compression error: " + reason.value());
@@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
 
       break;
     }
-    case parquet::Compression::UNCOMPRESSED: break;
+    case Compression::UNCOMPRESSED: break;
     default: CUDF_FAIL("invalid compression type");
   }
 
@@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
  * @param column_index_truncate_length maximum length of min or max values in column index, in bytes
  * @return Computed buffer size needed to encode the column index
  */
-size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length)
+size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length)
 {
   // encoding the column index for a given chunk requires:
   //   each list (4 of them) requires 6 bytes of overhead
@@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   std::vector<SchemaElement> this_table_schema(schema_tree.begin(), schema_tree.end());
 
   // Initialize column description
-  cudf::detail::hostdevice_vector<gpu::parquet_column_device_view> col_desc(parquet_columns.size(),
-                                                                            stream);
+  cudf::detail::hostdevice_vector<parquet_column_device_view> col_desc(parquet_columns.size(),
+                                                                       stream);
   std::transform(
     parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) {
       return pcol.get_device_view(stream);
@@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
 
   auto d_part_frag_offset = cudf::detail::make_device_uvector_async(
     part_frag_offset, stream, rmm::mr::get_current_device_resource());
-  cudf::detail::hostdevice_2dvector<gpu::PageFragment> row_group_fragments(
+  cudf::detail::hostdevice_2dvector<PageFragment> row_group_fragments(
     num_columns, num_fragments, stream);
 
   // Create table_device_view so that corresponding column_device_view data
@@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   if (num_fragments != 0) {
     // Move column info to device
     col_desc.host_to_device_async(stream);
-    leaf_column_views = create_leaf_column_device_views<gpu::parquet_column_device_view>(
+    leaf_column_views = create_leaf_column_device_views<parquet_column_device_view>(
       col_desc, *parent_column_table_device_view, stream);
 
     init_row_group_fragments(row_group_fragments,
@@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
 
   // Initialize row groups and column chunks
   auto const num_chunks = num_rowgroups * num_columns;
-  hostdevice_2dvector<gpu::EncColumnChunk> chunks(num_rowgroups, num_columns, stream);
+  hostdevice_2dvector<EncColumnChunk> chunks(num_rowgroups, num_columns, stream);
 
   // total fragments per column (in case they are non-uniform)
   std::vector<size_type> frags_per_column(num_columns, 0);
@@ -1678,7 +1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
       row_group.total_byte_size = 0;
       row_group.columns.resize(num_columns);
       for (int c = 0; c < num_columns; c++) {
-        gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c];
+        EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c];
 
         ck                   = {};
         ck.col_desc          = col_desc.device_ptr() + c;
@@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
             return l + r.num_values;
           });
         ck.plain_data_size = std::accumulate(
-          chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) {
+          chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) {
             return sum + frag.fragment_data_size;
           });
         auto& column_chunk_meta          = row_group.columns[c].meta_data;
@@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back();
 
   rmm::device_uvector<statistics_chunk> frag_stats(0, stream);
-  cudf::detail::hostdevice_vector<gpu::PageFragment> page_fragments(total_frags, stream);
+  cudf::detail::hostdevice_vector<PageFragment> page_fragments(total_frags, stream);
 
   // update fragments and/or prepare for fragment statistics calculation if necessary
   if (total_frags != 0) {
@@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
           auto const& row_group = agg_meta->file(p).row_groups[global_r];
           uint32_t const fragments_in_chunk =
             util::div_rounding_up_unsafe(row_group.num_rows, frag_size);
-          gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c];
-          ck.fragments            = page_fragments.device_ptr(frag_offset);
-          ck.first_fragment       = frag_offset;
+          EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c];
+          ck.fragments       = page_fragments.device_ptr(frag_offset);
+          ck.first_fragment  = frag_offset;
 
           // update the chunk pointer here for each fragment in chunk.fragments
           for (uint32_t i = 0; i < fragments_in_chunk; i++) {
@@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     size_t comp_rowgroup_size = 0;
     if (r < num_rowgroups) {
       for (int i = 0; i < num_columns; i++) {
-        gpu::EncColumnChunk* ck = &chunks[r][i];
-        ck->first_page          = num_pages;
+        EncColumnChunk* ck = &chunks[r][i];
+        ck->first_page     = num_pages;
         num_pages += ck->num_pages;
         pages_in_batch += ck->num_pages;
         rowgroup_size += ck->bfr_size;
@@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   }
 
   // Clear compressed buffer size if compression has been turned off
-  if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; }
+  if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; }
 
   // Initialize data pointers in batch
   uint32_t const num_stats_bfr =
@@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                               stream);
 
   rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream);
-  rmm::device_uvector<gpu::EncPage> pages(num_pages, stream);
+  rmm::device_uvector<EncPage> pages(num_pages, stream);
 
   // This contains stats for both the pages and the rowgroups. TODO: make them separate.
   rmm::device_uvector<statistics_chunk> page_stats(num_stats_bfr, stream);
@@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     auto bfr_c = static_cast<uint8_t*>(comp_bfr.data());
     for (auto j = 0; j < batch_list[b]; j++, r++) {
       for (auto i = 0; i < num_columns; i++) {
-        gpu::EncColumnChunk& ck = chunks[r][i];
-        ck.uncompressed_bfr     = bfr;
-        ck.compressed_bfr       = bfr_c;
-        ck.column_index_blob    = bfr_i;
+        EncColumnChunk& ck   = chunks[r][i];
+        ck.uncompressed_bfr  = bfr;
+        ck.compressed_bfr    = bfr_c;
+        ck.column_index_blob = bfr_i;
         bfr += ck.bfr_size;
         bfr_c += ck.compressed_size;
         if (stats_granularity == statistics_freq::STATISTICS_COLUMN) {
@@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         if (ck.ck_stat_size != 0) {
           std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
             device_span<uint8_t const>(dev_bfr, ck.ck_stat_size), stream);
-          cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size());
+          CompactProtocolReader cp(stats_blob.data(), stats_blob.size());
           cp.read(&column_chunk_meta.statistics);
           need_sync = true;
         }
@@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co
 
 void writer::impl::write_parquet_data_to_sink(
   std::unique_ptr<aggregate_writer_metadata>& updated_agg_meta,
-  device_span<gpu::EncPage const> pages,
-  host_2dspan<gpu::EncColumnChunk const> chunks,
+  device_span<EncPage const> pages,
+  host_2dspan<EncColumnChunk const> chunks,
   host_span<size_t const> global_rowgroup_base,
   host_span<int const> first_rg_in_part,
   host_span<size_type const> batch_list,
@@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink(
         int const global_r    = global_rowgroup_base[p] + r - first_rg_in_part[p];
         auto const& row_group = _agg_meta->file(p).row_groups[global_r];
         for (std::size_t i = 0; i < num_columns; i++) {
-          gpu::EncColumnChunk const& ck = chunks[r][i];
+          EncColumnChunk const& ck      = chunks[r][i];
           auto const& column_chunk_meta = row_group.columns[i].meta_data;
 
           // start transfer of the column index
@@ -2392,7 +2388,4 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata(
   return std::make_unique<std::vector<uint8_t>>(std::move(output));
 }
 
-}  // namespace parquet
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 89ef85ba2bd..1d27a8400c8 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -38,15 +38,11 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
-namespace io {
-namespace detail {
-namespace parquet {
+namespace cudf::io::parquet::detail {
+
 // Forward internal classes
 struct aggregate_writer_metadata;
 
-using namespace cudf::io::parquet;
-using namespace cudf::io;
 using cudf::detail::device_2dspan;
 using cudf::detail::host_2dspan;
 using cudf::detail::hostdevice_2dvector;
@@ -66,7 +62,7 @@ class writer::impl {
    */
   explicit impl(std::vector<std::unique_ptr<data_sink>> sinks,
                 parquet_writer_options const& options,
-                single_write_mode mode,
+                cudf::io::detail::single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -79,7 +75,7 @@ class writer::impl {
    */
   explicit impl(std::vector<std::unique_ptr<data_sink>> sinks,
                 chunked_parquet_writer_options const& options,
-                single_write_mode mode,
+                cudf::io::detail::single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -139,8 +135,8 @@ class writer::impl {
    * @param[out] bounce_buffer Temporary host output buffer
    */
   void write_parquet_data_to_sink(std::unique_ptr<aggregate_writer_metadata>& updated_agg_meta,
-                                  device_span<gpu::EncPage const> pages,
-                                  host_2dspan<gpu::EncColumnChunk const> chunks,
+                                  device_span<EncPage const> pages,
+                                  host_2dspan<EncColumnChunk const> chunks,
                                   host_span<size_t const> global_rowgroup_base,
                                   host_span<int const> first_rg_in_part,
                                   host_span<size_type const> batch_list,
@@ -164,9 +160,10 @@ class writer::impl {
   bool const _write_v2_headers;
   int32_t const _column_index_truncate_length;
   std::vector<std::map<std::string, std::string>> const _kv_meta;  // Optional user metadata.
-  single_write_mode const _single_write_mode;  // Special parameter only used by `write()` to
-                                               // indicate that we are guaranteeing a single table
-                                               // write. This enables some internal optimizations.
+  cudf::io::detail::single_write_mode const
+    _single_write_mode;  // Special parameter only used by `write()` to
+                         // indicate that we are guaranteeing a single table
+                         // write. This enables some internal optimizations.
   std::vector<std::unique_ptr<data_sink>> const _out_sink;
 
   // Internal states, filled during `write()` and written to sink during `write` and `close()`.
@@ -180,7 +177,4 @@ class writer::impl {
   bool _closed                = false;  // To track if the output has been written to sink.
 };
 
-}  // namespace parquet
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 73c946a5feb..3e5d7033e60 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -200,29 +200,30 @@ std::unique_ptr<cudf::column> make_parquet_list_list_col(
 // of the file to populate the FileMetaData pointed to by file_meta_data.
 // throws cudf::logic_error if the file or metadata is invalid.
 void read_footer(std::unique_ptr<cudf::io::datasource> const& source,
-                 cudf::io::parquet::FileMetaData* file_meta_data)
+                 cudf::io::parquet::detail::FileMetaData* file_meta_data)
 {
-  constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s);
-  constexpr auto ender_len  = sizeof(cudf::io::parquet::file_ender_s);
+  constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s);
+  constexpr auto ender_len  = sizeof(cudf::io::parquet::detail::file_ender_s);
 
   auto const len           = source->size();
   auto const header_buffer = source->host_read(0, header_len);
   auto const header =
-    reinterpret_cast<cudf::io::parquet::file_header_s const*>(header_buffer->data());
+    reinterpret_cast<cudf::io::parquet::detail::file_header_s const*>(header_buffer->data());
   auto const ender_buffer = source->host_read(len - ender_len, ender_len);
-  auto const ender = reinterpret_cast<cudf::io::parquet::file_ender_s const*>(ender_buffer->data());
+  auto const ender =
+    reinterpret_cast<cudf::io::parquet::detail::file_ender_s const*>(ender_buffer->data());
 
   // checks for valid header, footer, and file length
   ASSERT_GT(len, header_len + ender_len);
-  ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic &&
-              ender->magic == cudf::io::parquet::parquet_magic);
+  ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic &&
+              ender->magic == cudf::io::parquet::detail::parquet_magic);
   ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len));
 
   // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1"
   // seek backwards from the end of the file (footer_length + 8 bytes of ender)
   auto const footer_buffer =
     source->host_read(len - ender->footer_len - ender_len, ender->footer_len);
-  cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len);
+  cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len);
 
   // returns true on success
   bool res = cp.read(file_meta_data);
@@ -233,14 +234,14 @@ void read_footer(std::unique_ptr<cudf::io::datasource> const& source,
 // this assumes the data is uncompressed.
 // throws cudf::logic_error if the page_loc data is invalid.
 int read_dict_bits(std::unique_ptr<cudf::io::datasource> const& source,
-                   cudf::io::parquet::PageLocation const& page_loc)
+                   cudf::io::parquet::detail::PageLocation const& page_loc)
 {
   CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header");
   CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length");
 
-  cudf::io::parquet::PageHeader page_hdr;
+  cudf::io::parquet::detail::PageHeader page_hdr;
   auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size);
-  cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size());
+  cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size());
   bool res = cp.read(&page_hdr);
   CUDF_EXPECTS(res, "Cannot parse page header");
 
@@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr<cudf::io::datasource> const& source,
 // read column index from datasource at location indicated by chunk,
 // parse and return as a ColumnIndex struct.
 // throws cudf::logic_error if the chunk data is invalid.
-cudf::io::parquet::ColumnIndex read_column_index(
-  std::unique_ptr<cudf::io::datasource> const& source, cudf::io::parquet::ColumnChunk const& chunk)
+cudf::io::parquet::detail::ColumnIndex read_column_index(
+  std::unique_ptr<cudf::io::datasource> const& source,
+  cudf::io::parquet::detail::ColumnChunk const& chunk)
 {
   CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index");
   CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length");
 
-  cudf::io::parquet::ColumnIndex colidx;
+  cudf::io::parquet::detail::ColumnIndex colidx;
   auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length);
-  cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size());
+  cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size());
   bool res = cp.read(&colidx);
   CUDF_EXPECTS(res, "Cannot parse column index");
   return colidx;
@@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index(
 // read offset index from datasource at location indicated by chunk,
 // parse and return as an OffsetIndex struct.
 // throws cudf::logic_error if the chunk data is invalid.
-cudf::io::parquet::OffsetIndex read_offset_index(
-  std::unique_ptr<cudf::io::datasource> const& source, cudf::io::parquet::ColumnChunk const& chunk)
+cudf::io::parquet::detail::OffsetIndex read_offset_index(
+  std::unique_ptr<cudf::io::datasource> const& source,
+  cudf::io::parquet::detail::ColumnChunk const& chunk)
 {
   CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index");
   CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length");
 
-  cudf::io::parquet::OffsetIndex offidx;
+  cudf::io::parquet::detail::OffsetIndex offidx;
   auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length);
-  cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size());
+  cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size());
   bool res = cp.read(&offidx);
   CUDF_EXPECTS(res, "Cannot parse offset index");
   return offidx;
 }
 
 // Return as a Statistics from the column chunk
-cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk)
+cudf::io::parquet::detail::Statistics const& get_statistics(
+  cudf::io::parquet::detail::ColumnChunk const& chunk)
 {
   return chunk.meta_data.statistics;
 }
@@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu
 // read page header from datasource at location indicated by page_loc,
 // parse and return as a PageHeader struct.
 // throws cudf::logic_error if the page_loc data is invalid.
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr<cudf::io::datasource> const& source,
-                                               cudf::io::parquet::PageLocation const& page_loc)
+cudf::io::parquet::detail::PageHeader read_page_header(
+  std::unique_ptr<cudf::io::datasource> const& source,
+  cudf::io::parquet::detail::PageLocation const& page_loc)
 {
   CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header");
   CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length");
 
-  cudf::io::parquet::PageHeader page_hdr;
+  cudf::io::parquet::detail::PageHeader page_hdr;
   auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size);
-  cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size());
+  cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size());
   bool res = cp.read(&page_hdr);
   CUDF_EXPECTS(res, "Cannot parse page header");
   return page_hdr;
@@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows)
 
   // check first page header and make sure it has only page_rows values
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   ASSERT_GT(fmd.row_groups.size(), 0);
@@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows)
   // read first data page header.  sizeof(PageHeader) is not exact, but the thrift encoded
   // version should be smaller than size of the struct.
   auto const ph = read_page_header(
-    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0});
+    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0});
 
   EXPECT_EQ(ph.data_page_header.num_values, page_rows);
 }
@@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted)
 
   // check first page header and make sure it has only page_rows values
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   ASSERT_GT(fmd.row_groups.size(), 0);
@@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted)
   // read first data page header.  sizeof(PageHeader) is not exact, but the thrift encoded
   // version should be smaller than size of the struct.
   auto const ph = read_page_header(
-    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0});
+    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0});
 
   EXPECT_LE(ph.data_page_header.num_values, rows_per_page);
 }
@@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall)
 
   // check that file is written correctly when rows/page < fragment size
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   ASSERT_TRUE(fmd.row_groups.size() > 0);
@@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall)
   // read first data page header.  sizeof(PageHeader) is not exact, but the thrift encoded
   // version should be smaller than size of the struct.
   auto const ph = read_page_header(
-    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0});
+    source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0});
 
   // there should be only one page since the fragment size is larger than rows_per_page
   EXPECT_EQ(ph.data_page_header.num_values, num_rows);
@@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   ASSERT_GT(fmd.row_groups.size(), 0);
@@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted)
 
   // now check that the boundary order for chunk 1 is ascending,
   // chunk 2 is descending, and chunk 3 is unordered
-  cudf::io::parquet::BoundaryOrder expected_orders[] = {
-    cudf::io::parquet::BoundaryOrder::ASCENDING,
-    cudf::io::parquet::BoundaryOrder::DESCENDING,
-    cudf::io::parquet::BoundaryOrder::UNORDERED};
+  cudf::io::parquet::detail::BoundaryOrder expected_orders[] = {
+    cudf::io::parquet::detail::BoundaryOrder::ASCENDING,
+    cudf::io::parquet::detail::BoundaryOrder::DESCENDING,
+    cudf::io::parquet::detail::BoundaryOrder::UNORDERED};
 
   for (std::size_t i = 0; i < columns.size(); i++) {
     auto const ci = read_column_index(source, columns[i]);
@@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2)
 // 1 if v1 > v2.
 int32_t compare_binary(std::vector<uint8_t> const& v1,
                        std::vector<uint8_t> const& v2,
-                       cudf::io::parquet::Type ptype,
-                       cudf::io::parquet::ConvertedType ctype)
+                       cudf::io::parquet::detail::Type ptype,
+                       cudf::io::parquet::detail::ConvertedType ctype)
 {
   switch (ptype) {
-    case cudf::io::parquet::INT32:
+    case cudf::io::parquet::detail::INT32:
       switch (ctype) {
-        case cudf::io::parquet::UINT_8:
-        case cudf::io::parquet::UINT_16:
-        case cudf::io::parquet::UINT_32:
+        case cudf::io::parquet::detail::UINT_8:
+        case cudf::io::parquet::detail::UINT_16:
+        case cudf::io::parquet::detail::UINT_32:
           return compare(*(reinterpret_cast<uint32_t const*>(v1.data())),
                          *(reinterpret_cast<uint32_t const*>(v2.data())));
         default:
@@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector<uint8_t> const& v1,
                          *(reinterpret_cast<int32_t const*>(v2.data())));
       }
 
-    case cudf::io::parquet::INT64:
-      if (ctype == cudf::io::parquet::UINT_64) {
+    case cudf::io::parquet::detail::INT64:
+      if (ctype == cudf::io::parquet::detail::UINT_64) {
         return compare(*(reinterpret_cast<uint64_t const*>(v1.data())),
                        *(reinterpret_cast<uint64_t const*>(v2.data())));
       }
       return compare(*(reinterpret_cast<int64_t const*>(v1.data())),
                      *(reinterpret_cast<int64_t const*>(v2.data())));
 
-    case cudf::io::parquet::FLOAT:
+    case cudf::io::parquet::detail::FLOAT:
       return compare(*(reinterpret_cast<float const*>(v1.data())),
                      *(reinterpret_cast<float const*>(v2.data())));
 
-    case cudf::io::parquet::DOUBLE:
+    case cudf::io::parquet::detail::DOUBLE:
       return compare(*(reinterpret_cast<double const*>(v1.data())),
                      *(reinterpret_cast<double const*>(v2.data())));
 
-    case cudf::io::parquet::BYTE_ARRAY: {
+    case cudf::io::parquet::detail::BYTE_ARRAY: {
       int32_t v1sz = v1.size();
       int32_t v2sz = v2.size();
       int32_t ret  = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz));
@@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex)
 
 TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
 {
-  constexpr auto num_rows = 100000;
-  auto const is_v2        = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  constexpr auto num_rows      = 100000;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
@@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
 
 TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
 {
-  constexpr auto num_rows = 100000;
-  auto const is_v2        = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  constexpr auto num_rows      = 100000;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
@@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
 
 TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
 {
-  constexpr auto num_rows = 100000;
-  auto const is_v2        = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  constexpr auto num_rows      = 100000;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
@@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
 
 TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
 {
-  auto const is_v2 = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   auto c0 = testdata::ascending<uint32_t>();
 
@@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
 
 TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
 {
-  auto const is_v2 = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   auto validity2 =
     cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; });
@@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
 
 TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)
 {
-  auto const is_v2 = GetParam();
-  auto const expected_hdr_type =
-    is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE;
+  auto const is_v2             = GetParam();
+  auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
+                                       : cudf::io::parquet::detail::PageType::DATA_PAGE;
 
   using cudf::test::iterators::null_at;
   using cudf::test::iterators::nulls_at;
@@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation)
   cudf::io::write_parquet(out_opts);
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
@@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray)
   cudf::io::write_parquet(out_opts);
 
   auto source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
-  EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY);
+  EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY);
 
   std::vector<cudf::io::reader_column_schema> md{
     {},
@@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats)
   auto result = cudf::io::read_parquet(in_opts);
 
   auto source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
 
-  EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY);
-  EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY);
+  EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY);
+  EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY);
 
   auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]);
   auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]);
@@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray)
 
 TEST_F(ParquetReaderTest, NestingOptimizationTest)
 {
-  // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep.
+  // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep.
   constexpr cudf::size_type num_nesting_levels = 16;
-  static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info);
+  static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info);
   constexpr cudf::size_type rows_per_level = 2;
 
   constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level;
@@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest)
 
   // make sure dictionary was used
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto used_dict = [&fmd]() {
     for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) {
-      if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or
-          enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) {
+      if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or
+          enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) {
         return true;
       }
     }
@@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest)
 
   // make sure dictionary was not used
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto used_dict = [&fmd]() {
     for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) {
-      if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or
-          enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) {
+      if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or
+          enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) {
         return true;
       }
     }
@@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest)
   // make sure dictionary was used as expected. col0 should use one,
   // col1 should not.
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto used_dict = [&fmd](int col) {
     for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) {
-      if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or
-          enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) {
+      if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or
+          enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) {
         return true;
       }
     }
@@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest)
 
   // make sure dictionary was used for both columns
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto used_dict = [&fmd](int col) {
     for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) {
-      if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or
-          enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) {
+      if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or
+          enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) {
         return true;
       }
     }
@@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest)
 
   // make sure dictionary was used
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto used_dict = [&fmd]() {
     for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) {
-      if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or
-          enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) {
+      if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or
+          enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) {
         return true;
       }
     }
@@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability)
 
 TEST_P(ParquetV2Test, CheckEncodings)
 {
-  using cudf::io::parquet::Encoding;
+  using cudf::io::parquet::detail::Encoding;
   constexpr auto num_rows = 100'000;
   auto const is_v2        = GetParam();
 
@@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings)
   };
 
   auto const source = cudf::io::datasource::create(filepath);
-  cudf::io::parquet::FileMetaData fmd;
+  cudf::io::parquet::detail::FileMetaData fmd;
 
   read_footer(source, &fmd);
   auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings;

From e345620ddaf5d8ac87e2428a84508ecfec2ba4f8 Mon Sep 17 00:00:00 2001
From: Suraj Aralihalli <suraj.ara16@gmail.com>
Date: Mon, 9 Oct 2023 10:13:25 -0700
Subject: [PATCH 12/12] Add stream parameter to List Manipulation and
 Operations APIs (#14248)

I have organized the public List APIs into **three** distinct categories based on their functionality, so that each PR is smaller and easier to review. This particular PR introduces the `stream` parameter only to the `List Manipulation and Operations` APIs, which fall under Section 1. See the next comment on the pull request (#14248) for the other sections.


1. List Manipulation and Operations (`combine.hpp`, `contains.hpp`, `count_elements.hpp`)

```
concatenate_rows
concatenate_list_elements
contains_nulls
contains - search_keys
contains - search_key
index_of - search_keys
index_of - search_key
count_elements
```


This PR addresses issues in the following files:

1. **column_wrapper.hpp**:
      - Corrects the calls to `make_lists_column`, which omitted the stream argument, by passing `cudf::test::get_default_stream()` explicitly.
      - Adds the missing conversion operator to `lists_column_view`.
      - Substitutes `copy_bitmask` with `cudf::detail::copy_bitmask` to include the stream parameter.

2. **concatenate.cu:**

      - Substitutes `create_null_mask` with `cudf::detail::create_null_mask` to include the stream parameter.

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/14248
---
 cpp/include/cudf/lists/combine.hpp            |  4 +
 cpp/include/cudf/lists/contains.hpp           | 14 ++-
 cpp/include/cudf/lists/count_elements.hpp     |  2 +
 cpp/include/cudf_test/column_wrapper.hpp      | 45 +++++++---
 .../combine/concatenate_list_elements.cu      |  3 +-
 cpp/src/lists/combine/concatenate_rows.cu     |  3 +-
 cpp/src/lists/contains.cu                     | 37 ++++----
 cpp/src/lists/copying/concatenate.cu          |  5 +-
 cpp/src/lists/count_elements.cu               |  3 +-
 cpp/tests/CMakeLists.txt                      |  1 +
 cpp/tests/streams/lists_test.cpp              | 87 +++++++++++++++++++
 11 files changed, 169 insertions(+), 35 deletions(-)
 create mode 100644 cpp/tests/streams/lists_test.cpp

diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index 0bc76828fc3..0d9c1c157eb 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW };
  * @param input Table of lists to be concatenated.
  * @param null_policy The parameter to specify whether a null list element will be ignored from
  *        concatenation, or any concatenation involving a null element will result in a null list.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A new column in which each row is a list resulted from concatenating all list elements in
  *         the corresponding row of the input table.
@@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW };
 std::unique_ptr<column> concatenate_rows(
   table_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -86,6 +88,7 @@ std::unique_ptr<column> concatenate_rows(
  * @param input The lists column containing lists of list elements to concatenate.
  * @param null_policy The parameter to specify whether a null list element will be ignored from
  *        concatenation, or any concatenation involving a null element will result in a null list.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A new column in which each row is a list resulted from concatenating all list elements in
  *         the corresponding row of the input lists column.
@@ -93,6 +96,7 @@ std::unique_ptr<column> concatenate_rows(
 std::unique_ptr<column> concatenate_list_elements(
   column_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp
index 21c2ca1d64e..7cf67ec9205 100644
--- a/cpp/include/cudf/lists/contains.hpp
+++ b/cpp/include/cudf/lists/contains.hpp
@@ -42,12 +42,14 @@ namespace lists {
  *
  * @param lists Lists column whose `n` rows are to be searched
  * @param search_key The scalar key to be looked up in each list row
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return BOOL8 column of `n` rows with the result of the lookup
  */
 std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::scalar const& search_key,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -63,13 +65,15 @@ std::unique_ptr<column> contains(
  *   2. The list row `lists[i]` is null
  *
  * @param lists Lists column whose `n` rows are to be searched
- * @param search_keys Column of elements to be looked up in each list row
+ * @param search_keys Column of elements to be looked up in each list row.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return BOOL8 column of `n` rows with the result of the lookup
  */
 std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::column_view const& search_keys,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -84,12 +88,14 @@ std::unique_ptr<column> contains(
  * A row with an empty list will always return false.
  * Nulls inside non-null nested elements (such as lists or structs) are not considered.
  *
- * @param lists Lists column whose `n` rows are to be searched
+ * @param lists Lists column whose `n` rows are to be searched.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return BOOL8 column of `n` rows with the result of the lookup
  */
 std::unique_ptr<column> contains_nulls(
   cudf::lists_column_view const& lists,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t {
  * @param search_key The scalar key to be looked up in each list row
  * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
  * last (`FIND_LAST`)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return column of `n` rows with the location of the `search_key`
  */
@@ -132,6 +139,7 @@ std::unique_ptr<column> index_of(
   cudf::lists_column_view const& lists,
   cudf::scalar const& search_key,
   duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -160,6 +168,7 @@ std::unique_ptr<column> index_of(
  * `lists`
  * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
  * last (`FIND_LAST`)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return column of `n` rows with the location of the `search_key`
  */
@@ -167,6 +176,7 @@ std::unique_ptr<column> index_of(
   cudf::lists_column_view const& lists,
   cudf::column_view const& search_keys,
   duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp
index 552ba058b93..e4bd0dca9ae 100644
--- a/cpp/include/cudf/lists/count_elements.hpp
+++ b/cpp/include/cudf/lists/count_elements.hpp
@@ -45,11 +45,13 @@ namespace lists {
  * in the output column.
  *
  * @param input Input lists column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New column with the number of elements for each row
  */
 std::unique_ptr<column> count_elements(
   lists_column_view const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of lists_elements group
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index c0932b81dc3..e94dfea9dcf 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -21,6 +21,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/concatenate.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
@@ -1281,6 +1282,11 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
 template <typename T, typename SourceElementT = T>
 class lists_column_wrapper : public detail::column_wrapper {
  public:
+  /**
+   * @brief Cast to lists_column_view
+   */
+  operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; }
+
   /**
    * @brief Construct a lists column containing a single list of fixed-width
    * type from an initializer list of values.
@@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper {
                        rmm::device_buffer&& null_mask)
   {
     // construct the list column
-    wrapped = make_lists_column(
-      num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask));
+    wrapped = make_lists_column(num_rows,
+                                std::move(offsets),
+                                std::move(values),
+                                null_count,
+                                std::move(null_mask),
+                                cudf::test::get_default_stream());
   }
 
   /**
@@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper {
     }();
 
     // construct the list column
-    wrapped = make_lists_column(
-      cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask));
+    wrapped = make_lists_column(cols.size(),
+                                std::move(offsets),
+                                std::move(data),
+                                null_count,
+                                std::move(null_mask),
+                                cudf::test::get_default_stream());
   }
 
   /**
@@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper {
     depth = 0;
 
     size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1;
-    wrapped =
-      make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{});
+    wrapped                = make_lists_column(num_elements,
+                                std::move(offsets),
+                                std::move(c),
+                                0,
+                                rmm::device_buffer{},
+                                cudf::test::get_default_stream());
   }
 
   /**
@@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper {
     }
 
     lists_column_view lcv(col);
-    return make_lists_column(col.size(),
-                             std::make_unique<column>(lcv.offsets()),
-                             normalize_column(lists_column_view(col).child(),
-                                              lists_column_view(expected_hierarchy).child()),
-                             col.null_count(),
-                             copy_bitmask(col));
+    return make_lists_column(
+      col.size(),
+      std::make_unique<column>(lcv.offsets()),
+      normalize_column(lists_column_view(col).child(),
+                       lists_column_view(expected_hierarchy).child()),
+      col.null_count(),
+      cudf::detail::copy_bitmask(
+        col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
+      cudf::test::get_default_stream());
   }
 
   std::pair<std::vector<column_view>, std::vector<std::unique_ptr<column>>> preprocess_columns(
diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu
index fbe297765f8..99dbd55678b 100644
--- a/cpp/src/lists/combine/concatenate_list_elements.cu
+++ b/cpp/src/lists/combine/concatenate_list_elements.cu
@@ -271,10 +271,11 @@ std::unique_ptr<column> concatenate_list_elements(column_view const& input,
  */
 std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   concatenate_null_policy null_policy,
+                                                  rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr);
+  return detail::concatenate_list_elements(input, null_policy, stream, mr);
 }
 
 }  // namespace lists
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index 658538b0195..49be7b5ff17 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -305,10 +305,11 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
  */
 std::unique_ptr<column> concatenate_rows(table_view const& input,
                                          concatenate_null_policy null_policy,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr);
+  return detail::concatenate_rows(input, null_policy, stream, mr);
 }
 
 }  // namespace lists
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index df1d043bdb6..4733a5d63a8 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -287,7 +287,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
   }
 
   auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr);
-  return index_of(lists, search_key_col->view(), find_option, stream, mr);
+  return detail::index_of(lists, search_key_col->view(), find_option, stream, mr);
 }
 
 std::unique_ptr<column> index_of(lists_column_view const& lists,
@@ -306,11 +306,11 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  auto key_indices = index_of(lists,
-                              search_key,
-                              duplicate_find_option::FIND_FIRST,
-                              stream,
-                              rmm::mr::get_current_device_resource());
+  auto key_indices = detail::index_of(lists,
+                                      search_key,
+                                      duplicate_find_option::FIND_FIRST,
+                                      stream,
+                                      rmm::mr::get_current_device_resource());
   return to_contains(std::move(key_indices), stream, mr);
 }
 
@@ -322,11 +322,11 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
   CUDF_EXPECTS(search_keys.size() == lists.size(),
                "Number of search keys must match list column size.");
 
-  auto key_indices = index_of(lists,
-                              search_keys,
-                              duplicate_find_option::FIND_FIRST,
-                              stream,
-                              rmm::mr::get_current_device_resource());
+  auto key_indices = detail::index_of(lists,
+                                      search_keys,
+                                      duplicate_find_option::FIND_FIRST,
+                                      stream,
+                                      rmm::mr::get_current_device_resource());
   return to_contains(std::move(key_indices), stream, mr);
 }
 
@@ -364,43 +364,48 @@ std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
 
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(lists, search_key, cudf::get_default_stream(), mr);
+  return detail::contains(lists, search_key, stream, mr);
 }
 
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  column_view const& search_keys,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(lists, search_keys, cudf::get_default_stream(), mr);
+  return detail::contains(lists, search_keys, stream, mr);
 }
 
 std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains_nulls(lists, cudf::get_default_stream(), mr);
+  return detail::contains_nulls(lists, stream, mr);
 }
 
 std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  duplicate_find_option find_option,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr);
+  return detail::index_of(lists, search_key, find_option, stream, mr);
 }
 
 std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  column_view const& search_keys,
                                  duplicate_find_option find_option,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr);
+  return detail::index_of(lists, search_keys, find_option, stream, mr);
 }
 
 }  // namespace cudf::lists
diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu
index ddd0dfbe084..5407b88236f 100644
--- a/cpp/src/lists/copying/concatenate.cu
+++ b/cpp/src/lists/copying/concatenate.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -123,8 +124,8 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   // if any of the input columns have nulls, construct the output mask
   bool const has_nulls =
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
-  rmm::device_buffer null_mask = create_null_mask(
-    total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
+  rmm::device_buffer null_mask = cudf::detail::create_null_mask(
+    total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr);
   auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
   auto const null_count =
     has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index 40a14d805e1..2fd0851067a 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -73,10 +73,11 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
 // external APIS
 
 std::unique_ptr<column> count_elements(lists_column_view const& input,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::count_elements(input, cudf::get_default_stream(), mr);
+  return detail::count_elements(input, stream, mr);
 }
 
 }  // namespace lists
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ac13c121530..ffaba7d6fa7 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -638,6 +638,7 @@ ConfigureTest(
 )
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp
new file mode 100644
index 00000000000..e292b551d83
--- /dev/null
+++ b/cpp/tests/streams/lists_test.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/lists/combine.hpp>
+#include <cudf/lists/contains.hpp>
+#include <cudf/lists/count_elements.hpp>
+
+class ListTest : public cudf::test::BaseFixture {};  // fixture used by every TEST_F in this file
+
+TEST_F(ListTest, ConcatenateRows)  // concatenate_rows on the test stream; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col_1{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::lists_column_wrapper<int> list_col_2{{0, 1}, {2, 3}, {4, 5}};
+  cudf::table_view lists_table({list_col_1, list_col_2});  // two 3-row list columns
+  cudf::lists::concatenate_rows(
+    lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ConcatenateListElements)  // nested-list input, test stream; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}};
+  cudf::lists::concatenate_list_elements(
+    ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsNulls)  // contains_nulls on the test stream; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3}, {4, 5}};  // no null entries
+  cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsSearchKey)  // contains() with a scalar key; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::numeric_scalar<int32_t> search_key(2, true, cudf::test::get_default_stream());
+  cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsSearchKeys)  // contains() with a column of keys; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::fixed_width_column_wrapper<int> search_keys({1, 2, 3});  // 3 keys, 3 list rows
+  cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, IndexOfSearchKey)  // index_of() with a scalar key; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::numeric_scalar<int32_t> search_key(2, true, cudf::test::get_default_stream());
+  cudf::lists::index_of(list_col,
+                        search_key,
+                        cudf::lists::duplicate_find_option::FIND_FIRST,
+                        cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, IndexOfSearchKeys)  // index_of() with a column of keys; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::fixed_width_column_wrapper<int> search_keys({1, 2, 3});  // 3 keys, 3 list rows
+  cudf::lists::index_of(list_col,
+                        search_keys,
+                        cudf::lists::duplicate_find_option::FIND_FIRST,
+                        cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, CountElements)  // count_elements on the test stream; result unchecked
+{
+  cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3, 7}, {4, 5}};  // sizes 2, 3, 2
+  cudf::lists::count_elements(list_col, cudf::test::get_default_stream());
+}