diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 000f80065ab..f8b9762f1d4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -401,6 +401,7 @@ add_library(
   src/io/parquet/predicate_pushdown.cpp
   src/io/parquet/reader.cpp
   src/io/parquet/reader_impl.cpp
+  src/io/parquet/reader_impl_chunking.cu
   src/io/parquet/reader_impl_helpers.cpp
   src/io/parquet/reader_impl_preprocess.cu
   src/io/parquet/writer_impl.cu
diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu
index 460100a8fe9..e1169e3bcd6 100644
--- a/cpp/benchmarks/copying/shift.cu
+++ b/cpp/benchmarks/copying/shift.cu
@@ -56,18 +56,32 @@ static void BM_shift(benchmark::State& state)
   cudf::size_type size   = state.range(0);
   cudf::size_type offset = size * (static_cast<double>(shift_factor) / 100.0);

-  auto const input_table =
-    create_sequence_table({cudf::type_to_id<int>()},
-                          row_count{size},
-                          use_validity ? std::optional<double>{1.0} : std::nullopt);
+  auto constexpr column_type_id = cudf::type_id::INT32;
+  using column_type             = cudf::id_to_type<column_type_id>;
+
+  auto const input_table = create_sequence_table(
+    {column_type_id}, row_count{size}, use_validity ? std::optional<double>{1.0} : std::nullopt);
   cudf::column_view input{input_table->get_column(0)};

-  auto fill = use_validity ? make_scalar<int>() : make_scalar<int>(777);
+  auto fill = use_validity ? make_scalar<column_type>() : make_scalar<column_type>(777);

   for (auto _ : state) {
     cuda_event_timer raii(state, true);
     auto output = cudf::shift(input, offset, *fill);
   }
+
+  auto const elems_read = (size - offset);
+  auto const bytes_read = elems_read * sizeof(column_type);
+
+  // If 'use_validity' is false, the fill value is a number, and the entire column
+  // (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity'
+  // is true, only the elements that can be shifted are written, along with the full null bitmask.
+  auto const elems_written = use_validity ? (size - offset) : size;
+  auto const bytes_written = elems_written * sizeof(column_type);
+  auto const null_bytes    = use_validity ? 2 * cudf::bitmask_allocation_size_bytes(size) : 0;
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          (bytes_written + bytes_read + null_bytes));
 }

 class Shift : public cudf::benchmark {};
diff --git a/cpp/benchmarks/transpose/transpose.cpp b/cpp/benchmarks/transpose/transpose.cpp
index 2f41bda4b88..c2737325462 100644
--- a/cpp/benchmarks/transpose/transpose.cpp
+++ b/cpp/benchmarks/transpose/transpose.cpp
@@ -20,17 +20,19 @@
 #include
 #include
 #include
+#include
 #include
 #include

 static void BM_transpose(benchmark::State& state)
 {
-  auto count = state.range(0);
+  auto count                    = state.range(0);
+  constexpr auto column_type_id = cudf::type_id::INT32;
   auto int_column_generator =
     thrust::make_transform_iterator(thrust::counting_iterator(0), [count](int i) {
       return cudf::make_numeric_column(
-        cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID);
+        cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID);
     });
   auto input_table = cudf::table(
     std::vector<std::unique_ptr<cudf::column>>(int_column_generator, int_column_generator + count));
@@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state)
     cuda_event_timer raii(state, true);
     auto output = cudf::transpose(input);
   }
+
+  // Collect memory statistics.
+  auto const bytes_read    = static_cast<uint64_t>(input.num_columns()) * input.num_rows() *
+                             sizeof(cudf::id_to_type<column_type_id>);
+  auto const bytes_written = bytes_read;
+  // Account for nullability in input and output.
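+  // cudf::bitmask_allocation_size_bytes(n) is the padded allocation size of an n-bit null
+  // mask (rounded up to a 64-byte boundary), so the factor of 2 below charges one mask per
+  // input column plus an equal-sized estimate for the transposed output's masks.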
+  auto const null_bytes    = 2 * static_cast<uint64_t>(input.num_columns()) *
+                             cudf::bitmask_allocation_size_bytes(input.num_rows());
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          (bytes_read + bytes_written + null_bytes));
 }

 class Transpose : public cudf::benchmark {};
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 074f690d2c7..0b8ee9676de 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -38,7 +38,7 @@ class parquet_reader_options;
 class parquet_writer_options;
 class chunked_parquet_writer_options;

-namespace detail::parquet {
+namespace parquet::detail {

 /**
  * @brief Class to read Parquet dataset data into columns.
@@ -186,7 +186,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   parquet_writer_options const& options,
-                  single_write_mode mode,
+                  cudf::io::detail::single_write_mode mode,
                   rmm::cuda_stream_view stream);

   /**
@@ -201,7 +201,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   chunked_parquet_writer_options const& options,
-                  single_write_mode mode,
+                  cudf::io::detail::single_write_mode mode,
                   rmm::cuda_stream_view stream);

   /**
@@ -250,5 +250,5 @@ class writer {
  * metadata.
  */
 parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);
-}  // namespace detail::parquet
+}  // namespace parquet::detail
 }  // namespace cudf::io
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index deaf23d405a..6283099e700 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -499,7 +499,7 @@ class chunked_parquet_reader {
   [[nodiscard]] table_with_metadata read_chunk() const;

  private:
-  std::unique_ptr<cudf::io::detail::parquet::chunked_reader> reader;
+  std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
 };

 /** @} */  // end of group
@@ -1750,7 +1750,7 @@ class parquet_chunked_writer {
                                 std::vector<std::string> const& column_chunks_file_paths = {});

   /// Unique pointer to impl writer class
-  std::unique_ptr<cudf::io::detail::parquet::writer> writer;
+  std::unique_ptr<cudf::io::parquet::detail::writer> writer;
 };

 /** @} */  // end of group
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index 0bc76828fc3..0d9c1c157eb 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW };
  * @param input Table of lists to be concatenated.
  * @param null_policy The parameter to specify whether a null list element will be ignored from
  * concatenation, or any concatenation involving a null element will result in a null list.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A new column in which each row is a list resulted from concatenating all list elements in
  * the corresponding row of the input table.
@@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW };
 std::unique_ptr<column> concatenate_rows(
   table_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -86,6 +88,7 @@ std::unique_ptr<column> concatenate_rows(
  * @param input The lists column containing lists of list elements to concatenate.
  * @param null_policy The parameter to specify whether a null list element will be ignored from
  * concatenation, or any concatenation involving a null element will result in a null list.
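+ * Example with an explicit stream (an illustrative sketch; `lists_of_lists` and `stream`
+ * are assumed names for a LIST<LIST<T>> column and a caller-owned stream, not part of this
+ * header):
+ * @code{.cpp}
+ * auto result = cudf::lists::concatenate_list_elements(
+ *   lists_of_lists, cudf::lists::concatenate_null_policy::IGNORE, stream);
+ * @endcode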
+ * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index 7699159fbea..f0cb351eeda 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. + * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. - * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. 
- * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. 
+ * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..e94dfea9dcf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1281,6 +1282,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. @@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 
0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..1a345ee0750 100644 --- 
a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -21,9 +21,7 @@
 #include
 #include

-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {

 /**
  * @brief Base class for parquet field functors.
@@ -769,12 +767,15 @@ bool CompactProtocolReader::read(ColumnIndex* c)

 bool CompactProtocolReader::read(Statistics* s)
 {
-  auto op = std::make_tuple(parquet_field_binary(1, s->max),
-                            parquet_field_binary(2, s->min),
-                            parquet_field_int64(3, s->null_count),
-                            parquet_field_int64(4, s->distinct_count),
-                            parquet_field_binary(5, s->max_value),
-                            parquet_field_binary(6, s->min_value));
+  using optional_binary = parquet_field_optional<std::vector<uint8_t>, parquet_field_binary>;
+  using optional_int64  = parquet_field_optional<int64_t, parquet_field_int64>;
+
+  auto op = std::make_tuple(optional_binary(1, s->max),
+                            optional_binary(2, s->min),
+                            optional_int64(3, s->null_count),
+                            optional_int64(4, s->distinct_count),
+                            optional_binary(5, s->max_value),
+                            optional_binary(6, s->min_value));
   return function_builder(this, op);
 }

@@ -870,6 +871,4 @@ int CompactProtocolReader::WalkSchema(
   }
 }

-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 619815db503..cbb4161b138 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -25,9 +25,8 @@
 #include
 #include

-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {
+
 /**
  * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata
  *
@@ -147,6 +146,4 @@ class CompactProtocolReader {
   friend class parquet_field_struct_blob;
 };

-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 60bc8984d81..00810269d3c 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -16,9 +16,7 @@

 #include "compact_protocol_writer.hpp"

-namespace cudf {
-namespace io {
-namespace parquet {
+namespace cudf::io::parquet::detail {

 /**
  * @brief Parquet CompactProtocolWriter class
@@ -197,12 +195,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
 size_t CompactProtocolWriter::write(Statistics const& s)
 {
   CompactProtocolFieldWriter c(*this);
-  if (not s.max.empty()) { c.field_binary(1, s.max); }
-  if (not s.min.empty()) { c.field_binary(2, s.min); }
-  if (s.null_count != -1) { c.field_int(3, s.null_count); }
-  if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); }
-  if (not s.max_value.empty()) { c.field_binary(5, s.max_value); }
-  if (not s.min_value.empty()) { c.field_binary(6, s.min_value); }
+  if (s.max.has_value()) { c.field_binary(1, s.max.value()); }
+  if (s.min.has_value()) { c.field_binary(2, s.min.value()); }
+  if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); }
+  if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); }
+  if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); }
+  if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); }
   return c.value();
 }

@@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field)
   current_field_value = field;
 }

-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
+} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..cce3659b902 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index d70cabdd35f..7c866fd8b9e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace 
cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..78873d5e8ca 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -41,10 +41,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..eae8e05e61e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -20,10 +20,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t 
kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..4d79770ec34 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. 
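For reference, the 16B-per-thread copy pattern that comment describes looks roughly like the
sketch below (a hand-written illustration, not this file's actual body; the `uint4` loads
assume the source and destination spans are suitably 16-byte aligned):

    // one warp cooperatively copies a single string; each lane moves 16 bytes per step
    __device__ inline void copy_string_16b(uint8_t* dst, uint8_t const* src, int len, int lane)
    {
      constexpr int warp_size = 32;
      int const chunks        = len / 16;
      for (int i = lane; i < chunks; i += warp_size) {
        reinterpret_cast<uint4*>(dst)[i] = reinterpret_cast<uint4 const*>(src)[i];
      }
      // lane 0 copies the sub-16-byte tail one byte at a time
      if (lane == 0) {
        for (int i = chunks * 16; i < len; ++i) { dst[i] = src[i]; }
      }
    }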
@@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 1df49262e87..1cd16ac6102 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include @@ -25,9 +27,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -153,8 +154,8 @@ struct SchemaElement { // The following fields are filled in later during schema initialization int max_definition_level = 0; int max_repetition_level = 0; - int parent_idx = 0; - std::vector children_idx; + size_type parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { @@ -214,12 +215,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** @@ -405,6 +412,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp 
b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..6a93fec0c46 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -320,79 +318,6 @@ struct ColumnChunkDesc { int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. 
- size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ @@ -739,7 +664,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +687,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +706,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +772,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +788,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -150,12 +150,14 @@ struct stats_caster { { } - void 
set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to @@ -527,4 +529,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..db81222157a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
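  // Each set bit in kernel_mask corresponds to one of the specialized decode kernels launched
  // below (KERNEL_MASK_STRING, KERNEL_MASK_DELTA_BINARY, KERNEL_MASK_GENERAL); giving each
  // selected kernel its own stream lets the decode launches overlap.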
- auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -349,14 +349,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. 
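  // From the public API, the pass/chunk structure is driven by the two limits handed to the
  // chunked reader; roughly (illustrative only):
  //   cudf::io::chunked_parquet_reader reader(
  //     output_chunk_limit_bytes, input_pass_limit_bytes, options);
  //   while (reader.has_next()) { auto tbl_chunk = reader.read_chunk(); }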
- compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,16 +364,16 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. - setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -541,8 +541,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + size_t const num_input_passes = std::max( + int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || (_current_input_pass < num_input_passes); } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -22,6 +22,7 @@ #pragma once #include "parquet_gpu.hpp" +#include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" #include @@ -35,7 +36,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -136,10 +137,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - void load_global_chunk_info(); - void compute_input_pass_row_group_info(); - void setup_pass(); - /** * @brief Create chunk information and start file reads * @@ -250,6 +247,31 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Close out the existing pass (if any) and prepare for the next pass. 
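+   *
+   * A pass covers a contiguous span of the file's row groups; its skip_rows/num_rows
+   * bounds are per-pass adjustments of the global values kept in file_intermediate_data.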
+ */ + void setup_next_pass(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. + */ + void compute_splits_for_pass(); + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -261,10 +283,10 @@ class reader::impl { std::vector<input_column_info> _input_columns; // Buffers for generating output columns - std::vector<inline_column_buffer> _output_buffers; + std::vector<cudf::io::detail::inline_column_buffer> _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector<inline_column_buffer> _output_buffers_template; + std::vector<cudf::io::detail::inline_column_buffer> _output_buffers_template; // _output_buffers associated schema indices std::vector<int> _output_column_schemas; @@ -278,27 +300,24 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level there is the "pass" in which we try and limit the + // At the top level, the entire file is divided up into "passes" in which we try to limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. // // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr<cudf::io::parquet::gpu::pass_intermediate_data> _pass_itm_data; - - // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents - // the start/end of the chunks to be loaded for a given pass. - std::vector<std::size_t> _input_pass_row_group_offsets{}; - std::vector<std::size_t> _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; + std::unique_ptr<pass_intermediate_data> _pass_itm_data; bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes + + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..ad52a7dfcc1 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. 
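+ * + * Used below with thrust::inclusive_scan_by_key so that, within a single schema key, row + * counts and byte sizes accumulate page by page.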
+ */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template <typename T> + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t<T>); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()<list_view>(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size, on the order of: # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()<struct_view>(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()<string_view>(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast<size_t>(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns.
Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + * | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector<chunk_read_info> find_splits(std::vector<cumulative_row_info> const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector<chunk_read_info> splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is larger than the chunk_read_limit, move + // back one. + if (static_cast<size_t>(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best effort: if we can't find something that'll fit, we have to go bigger. we're doing this + // in a loop because all of the cumulative sizes for all the pages are sorted into one big + // list. so if we had two columns, both of which had an entry {1000, 10000}, that entry would + // be in the list twice. so we have to iterate until we skip past all of them. The idea is + // that we either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast<int64_t>(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units.
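+ * + * For example, per the mapping below, an INT16 cuDF column read from a Parquet INT32 column + * reports a type width of 2 bytes, and chrono columns report the clock rate implied by the + * requested timestamp type.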
+ * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id, + type_id timestamp_type_id, + Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template <typename T = uint8_t> +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast<T>(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters.
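+ // Illustration (hypothetical sizes, not from this change): with _input_pass_read_limit = 10 + // and row-group byte sizes {6, 6, 3}, the loop below closes pass 0 after row group 0 (since + // 6 + 6 >= 10) and a final pass covers row groups 1 and 2 (6 + 3 < 10), leaving + // input_pass_row_group_offsets = {0, 1, 3}.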
auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // the read limit by itself. + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits<std::size_t>::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group to the current pass? + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique<pass_intermediate_data>(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = cudf::detail::hostdevice_vector<ColumnChunkDesc>(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust
skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case: no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + _pass_itm_data->output_chunk_read_info = std::vector<chunk_read_info>{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector<cumulative_row_info> c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector<cumulative_row_info> c_info_sorted{c_info, _stream}; + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); + + // std::vector<cumulative_row_info> h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector<size_type> key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page.
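+ // For example, if column B's 200-row page holds 1000 bytes, a candidate split at row 100 is + // charged the full 1000 bytes for column B rather than a prorated 500.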
+ // + rmm::device_uvector<cumulative_row_info> aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector<cumulative_row_info> h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..dfc239d8451 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "reader_impl_helpers.hpp" + +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector<row_group_info> row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector<ColumnChunkDesc> chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector<std::size_t> input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector<std::size_t> input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector<std::unique_ptr<datasource::buffer>> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass.
+ std::vector<row_group_info> row_groups{}; + cudf::detail::hostdevice_vector<ColumnChunkDesc> chunks{}; + cudf::detail::hostdevice_vector<PageInfo> pages_info{}; + cudf::detail::hostdevice_vector<PageNestingInfo> page_nesting_info{}; + cudf::detail::hostdevice_vector<PageNestingDecodeInfo> page_nesting_decode_info{}; + + rmm::device_uvector<int32_t> page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector<int32_t> page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector<string_index_pair> str_dict_index{0, rmm::cuda_stream_default}; + + std::vector<chunk_read_info> output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9778cfc47d2..040c6403f57 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available.
auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. 
switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -213,7 +213,7 @@ void metadata::sanitize_schema() // add a struct child and move this element's children to the struct schema_elem.converted_type = LIST; schema_elem.repetition_type = OPTIONAL; - auto const struct_node_idx = schema.size(); + auto const struct_node_idx = static_cast(schema.size()); SchemaElement struct_elem; struct_elem.name = "struct_node"; @@ -420,7 +420,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -438,7 +438,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -478,7 +478,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -496,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -529,7 +530,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -569,7 +571,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -732,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9ee17f26a10..8d8ab8707be 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,24 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { -using namespace cudf::io::parquet; +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. 
+ + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -182,7 +197,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -202,12 +217,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -276,4 +292,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..ce45f709ee1 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -43,7 +42,7 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -169,46 +168,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. 
* @@ -226,7 +185,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +198,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +258,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +295,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +308,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. 
int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +346,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +370,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +400,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +437,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
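// A minimal sketch of the V2 handling referenced above: Parquet V2 data pages store their
// repetition/definition level bytes uncompressed ahead of the (possibly compressed) values,
// so the reader copies `offset` bytes through unchanged and decompresses only the remainder.
// `copy_in`/`copy_out`/`comp_in`/`comp_out`/`dst_base` are assumed staging names for this
// illustration, not identifiers taken from the diff:
//
//   auto const offset = page.lvl_bytes[level_type::DEFINITION] +
//                       page.lvl_bytes[level_type::REPETITION];
//   if (offset != 0) {
//     copy_in.emplace_back(page.page_data, offset);  // levels: copy as-is, don't decompress
//     copy_out.emplace_back(dst_base, offset);
//   }
//   comp_in.emplace_back(page.page_data + offset, page.compressed_page_size - offset);
//   comp_out.emplace_back(dst_base + offset, page.uncompressed_page_size - offset);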
@@ -509,11 +464,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +480,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +489,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +549,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +608,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +672,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -793,164 +748,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col 
= _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const 
row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. 
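// (Shape of the function per the hunks below: read_and_decompress_column_chunks() brings the
// compressed chunk data into memory, count_page_headers() sizes the page array,
// decode_page_headers() fills it and computes the level-type size, and decompress_page_data()
// inflates any compressed pages, after which the raw compressed buffers are freed.)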
@@ -970,7 +767,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +775,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +816,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1039,372 +835,19 @@ void print_pages(cudf::detail::hostdevice_vector& pages, p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, 
-    if (splits.has_value()) {
-      // if we have a split at this row count and this is the last instance of this row count
-      auto start = thrust::make_transform_iterator(
-        splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; });
-      auto end   = start + splits->size();
-      auto split = std::find(start, end, sizes[idx].row_count);
-      auto const split_index = [&]() -> int {
-        if (split != end &&
-            ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) {
-          return static_cast<int>(std::distance(start, split));
-        }
-        return idx == 0 ? 0 : -1;
-      }();
-      if (split_index >= 0) {
-        printf(" <-- split {%lu, %lu}",
-               splits.value()[split_index].skip_rows,
-               splits.value()[split_index].num_rows);
-      }
-    }
-    printf("\n");
-  }
-}
 #endif  // PREPROCESS_DEBUG

-/**
- * @brief Functor which reduces two cumulative_row_info structs of the same key.
- */
-struct cumulative_row_sum {
-  cumulative_row_info operator()
-    __device__(cumulative_row_info const& a, cumulative_row_info const& b) const
-  {
-    return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key};
-  }
-};
-
-/**
- * @brief Functor which computes the total data size for a given type of cudf column.
- *
- * In the case of strings, the return size does not include the chars themselves. That
- * information is tracked separately (see PageInfo::str_bytes).
- */
-struct row_size_functor {
-  __device__ size_t validity_size(size_t num_rows, bool nullable)
-  {
-    return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0;
-  }
-
-  template <typename T>
-  __device__ size_t operator()(size_t num_rows, bool nullable)
-  {
-    auto const element_size = sizeof(device_storage_type_t<T>);
-    return (element_size * num_rows) + validity_size(num_rows, nullable);
-  }
-};
-
-template <>
-__device__ size_t row_size_functor::operator()<list_view>(size_t num_rows, bool nullable)
-{
-  auto const offset_size = sizeof(size_type);
-  // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset
-  // for the entire column, whereas this is adding an extra offset per page. So we will get a
-  // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better
-  // to overestimate size somewhat than to underestimate it and potentially generate chunks
-  // that are too large.
-  return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable);
-}
-
-template <>
-__device__ size_t row_size_functor::operator()<struct_view>(size_t num_rows, bool nullable)
-{
-  return validity_size(num_rows, nullable);
-}
-
-template <>
-__device__ size_t row_size_functor::operator()<string_view>(size_t num_rows, bool nullable)
-{
-  // only returns the size of offsets and validity. the size of the actual string chars
-  // is tracked separately.
-  auto const offset_size = sizeof(size_type);
-  // see note about offsets in the list_view template.
-  return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable);
-}
-
-/**
- * @brief Functor which computes the total output cudf data size for all of
- * the data in this page.
- *
- * Sums across all nesting levels.
- */ -struct get_cumulative_row_info { - gpu::PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. 
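Aside: find_splits, removed here and carried forward by the refactor, is easier to follow in isolation. A standalone sketch of the same walk under simplified, hypothetical types; one binary search per emitted chunk over cumulative {row_count, size_bytes} pairs sorted by row count:

#include <algorithm>
#include <cstddef>
#include <vector>

struct cum_size { std::size_t row_count; std::size_t size_bytes; };
struct read_chunk { std::size_t skip_rows; std::size_t num_rows; };

std::vector<read_chunk> find_splits_sketch(std::vector<cum_size> const& sizes, std::size_t limit)
{
  std::vector<read_chunk> out;
  std::size_t cur_row = 0, cur_size = 0;
  auto const total_rows = sizes.back().row_count;  // assumes non-empty input
  while (cur_row < total_rows) {
    // first entry whose size, measured from the last cut, exceeds the limit...
    auto it = std::upper_bound(
      sizes.begin(), sizes.end(), cur_size + limit, [](std::size_t v, cum_size const& c) {
        return v < c.size_bytes;
      });
    // ...then step back to the last entry that still fits. Best effort: if even
    // the first entry is over the limit, it is taken anyway ("go bigger").
    if (it != sizes.begin()) { --it; }
    // never emit a zero-row chunk: skip entries that do not advance the row count
    while (it + 1 != sizes.end() && it->row_count <= cur_row) { ++it; }
    out.push_back({cur_row, it->row_count - cur_row});
    cur_row  = it->row_count;
    cur_size = it->size_bytes;
  }
  return out;
}

For example, {{100, 4000}, {200, 8000}, {400, 16000}} with an 8000-byte limit yields the chunks {0, 200} and {200, 200}.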
- if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +884,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +893,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +911,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +933,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +972,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +983,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1024,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1074,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1179,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1190,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1226,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1241,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1831,12 +1272,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1290,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). 
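Aside: the recomputation below only matters when the caller passed explicit bounds; each page then contributes only the rows that intersect the requested window. A hedged standalone sketch of that per-page trimming, with illustrative names (the real ComputePageSizes also walks nesting levels and runs on device):

#include <algorithm>
#include <cstddef>

std::size_t rows_in_window(std::size_t page_start_row,
                           std::size_t page_num_rows,
                           std::size_t skip_rows,
                           std::size_t num_rows)
{
  // intersect [page_start, page_end) with [skip_rows, skip_rows + num_rows)
  auto const page_end = page_start_row + page_num_rows;
  auto const lo       = std::max(page_start_row, skip_rows);
  auto const hi       = std::min(page_end, skip_rows + num_rows);
  return hi > lo ? hi - lo : 0;
}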
if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1316,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +1451,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..50589f23626 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +758,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view 
cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +969,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, 
- device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1043,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - 
gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 
+1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
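Aside: the hunk that follows carves each chunk's uncompressed, compressed, and column-index buffers out of a few large device allocations by advancing raw pointers. The idiom in miniature (host memory standing in for the rmm device buffers; illustrative names):

#include <cstddef>
#include <cstdint>
#include <vector>

struct chunk_bufs { std::uint8_t* data; std::size_t size; };

// Hand each chunk the next `size` bytes of one big allocation.
void carve(std::uint8_t* base, std::vector<chunk_bufs>& chunks)
{
  for (auto& c : chunks) {
    c.data = base;
    base += c.size;
  }
}

One allocation plus pointer bumps avoids per-chunk allocator traffic; the trade-off is that individual chunk buffers cannot be freed early.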
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2388,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..1d27a8400c8 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +160,10 @@ class writer::impl { bool 
const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +177,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7a7121aa91d..5cdd92ce3b7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -375,6 +375,14 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return source->device_read_async(offset, size, dst, stream); + } + [[nodiscard]] size_t size() const override { return source->size(); } private: diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr 
concatenate_list_elements(column_view const& input, */ std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..4733a5d63a8 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -287,7 +287,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +306,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +322,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -364,43 +364,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? 
cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac13c121530..b15a6c41d39 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -633,11 +633,12 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE - testing + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp + streams/strings/strings_tests.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 73c946a5feb..fa85e3a4a1d 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. 
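For orientation before the test helpers below: a parquet file ends with the thrift-encoded footer, then a 4-byte little-endian footer length, then the 4-byte magic "PAR1" (the same two fields cuDF's file_ender_s packs). A standalone sketch of the ender validation these tests perform, assuming a little-endian host:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <string>

bool check_parquet_ender(std::string const& path, std::uint32_t& footer_len)
{
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) { return false; }
  auto const len = static_cast<std::size_t>(f.tellg());
  if (len < 12) { return false; }  // 4-byte header magic + 8-byte ender
  char ender[8];
  f.seekg(static_cast<std::streamoff>(len - 8));
  f.read(ender, 8);
  std::memcpy(&footer_len, ender, 4);  // little-endian on a little-endian host
  return std::memcmp(ender + 4, "PAR1", 4) == 0 && footer_len <= len - 12;
}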
void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. 
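Background for read_column_index: the ColumnIndex carries per-page min/max and null statistics, which is what lets a reader prune pages against a predicate without touching the data. A toy illustration of that use, with simplified types:

#include <cstddef>
#include <vector>

struct page_stats { int min_value; int max_value; };

// Indices of pages whose [min, max] range could contain `key`.
std::vector<std::size_t> candidate_pages(std::vector<page_stats> const& index, int key)
{
  std::vector<std::size_t> out;
  for (std::size_t i = 0; i < index.size(); ++i) {
    if (key >= index[i].min_value && key <= index[i].max_value) { out.push_back(i); }
  }
  return out;
}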
-cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. 
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - 
cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4156,18 +4161,20 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4217,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4237,6 +4244,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4245,20 +4255,20 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4321,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4339,7 +4349,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4351,10 +4364,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4362,10 +4375,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4416,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4431,7 +4444,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? 
num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4444,12 +4462,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4458,9 +4476,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4513,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4528,13 +4546,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4542,9 +4563,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4607,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4637,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4732,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4833,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4824,11 +4845,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4870,7 +4894,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4885,8 +4909,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -5030,10 +5056,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5107,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, 
cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5163,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5232,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5278,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5329,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5380,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5464,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6690,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6723,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; @@ -6732,6 +6758,38 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = 
std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { constexpr unsigned char repeated_bytes[] = { diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..e292b551d83 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +} diff --git 
a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsTest, Strip) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::strip(view, side, strip, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Pad) +{ + auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"}); + auto view = cudf::strings_column_view(input); + + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream()); + cudf::strings::zfill(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Wrap) +{ + auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::wrap(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Slice) +{ + auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"}); + auto view = cudf::strings_column_view(input); + + auto start = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + auto stop = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::strings::slice_strings(view, start, stop, step, cudf::test::get_default_stream()); + + auto starts = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto stops = cudf::test::fixed_width_column_wrapper({4, 5, 6}); + cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream()); +} diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index e1298e29925..7cc3d30a9cf 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import ai.rapids.cudf.NvtxColor; -import ai.rapids.cudf.NvtxRange; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +87,21 @@ private Stream() { this.id = -1; } + private Stream(long id) { + this.cleaner = null; + this.id = id; + } + + /** + * Wrap a given stream ID to make it accessible. 
+ */ + static Stream wrap(long id) { + if (id == -1) { + return DEFAULT_STREAM; + } + return new Stream(id); + } + /** * Have this stream not execute new work until the work recorded in event completes. * @param event the event to wait on. @@ -122,7 +134,9 @@ public synchronized void close() { cleaner.delRef(); } if (closed) { - cleaner.logRefCountDebug("double free " + this); + if (cleaner != null) { + cleaner.logRefCountDebug("double free " + this); + } throw new IllegalStateException("Close called too many times " + this); } if (cleaner != null) { diff --git a/java/src/main/java/ai/rapids/cudf/DataSource.java b/java/src/main/java/ai/rapids/cudf/DataSource.java new file mode 100644 index 00000000000..1e5893235df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSource.java @@ -0,0 +1,189 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; + +/** + * Base class that can be used to provide data dynamically to CUDF. This follows somewhat + * closely with cudf::io::datasource. There are a few main differences. + *
+ * First, this does not expose async device reads. It will call the non-async device read API + * instead. This might be added in the future, but there was no direct use case for it in Java + * right now to warrant the added complexity. *
+ * Second there is no implementation of the device read API that returns a buffer instead of + * writing into one. This is not used by CUDF yet so testing an implementation that isn't used + * didn't feel ideal. If it is needed we will add one in the future. + */ +public abstract class DataSource implements AutoCloseable { + private static final Logger log = LoggerFactory.getLogger(DataSource.class); + + /** + * This is used to keep track of the HostMemoryBuffers in java land so the C++ layer + * does not have to do it. + */ + private final HashMap cachedBuffers = new HashMap<>(); + + @Override + public void close() { + if (!cachedBuffers.isEmpty()) { + throw new IllegalStateException("DataSource closed before all returned host buffers were closed"); + } + } + + /** + * Get the size of the source in bytes. + */ + public abstract long size(); + + /** + * Read data from the source at the given offset. Return a HostMemoryBuffer for the data + * that was read. + * @param offset where to start reading from. + * @param amount the maximum number of bytes to read. + * @return a buffer that points to the data. + * @throws IOException on any error. + */ + public abstract HostMemoryBuffer hostRead(long offset, long amount) throws IOException; + + + /** + * Called when the buffer returned from hostRead is done. The default is to close the buffer. + */ + protected void onHostBufferDone(HostMemoryBuffer buffer) { + if (buffer != null) { + buffer.close(); + } + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from in the source. + * @param dest where to write the data. + * @return the actual number of bytes written to dest. + */ + public abstract long hostRead(long offset, HostMemoryBuffer dest) throws IOException; + + /** + * Return true if this supports reading directly to the device else false. The default is + * no device support. This cannot change dynamically. It is typically read just once. + */ + public boolean supportsDeviceRead() { + return false; + } + + /** + * Get the size cutoff between device reads and host reads when device reads are supported. + * Anything larger than the cutoff will be a device read and anything smaller will be a + * host read. By default, the cutoff is 0 so all reads will be device reads if device reads + * are supported. + */ + public long getDeviceReadCutoff() { + return 0; + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from + * @param dest where to write the data. + * @param stream the stream to do the copy on. + * @return the actual number of bytes written to dest. 
+ */ + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) throws IOException { + throw new IllegalStateException("Device read is not implemented"); + } + + ///////////////////////////////////////////////// + // Internal methods called from JNI + ///////////////////////////////////////////////// + + private static class NoopCleaner extends MemoryBuffer.MemoryBufferCleaner { + @Override + protected boolean cleanImpl(boolean logErrorIfNotClean) { + return true; + } + + @Override + public boolean isClean() { + return true; + } + } + private static final NoopCleaner cleaner = new NoopCleaner(); + + // Called from JNI + private void onHostBufferDone(long bufferId) { + HostMemoryBuffer hmb = cachedBuffers.remove(bufferId); + if (hmb != null) { + onHostBufferDone(hmb); + } else { + // Called from C++ destructor so avoid throwing... + log.warn("Got a close callback for a buffer we could not find " + bufferId); + } + } + + // Called from JNI + private long hostRead(long offset, long amount, long dst) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot allocate more than " + Long.MAX_VALUE + " bytes"); + } + try (HostMemoryBuffer dstBuffer = new HostMemoryBuffer(dst, amount, cleaner)) { + return hostRead(offset, dstBuffer); + } + } + + // Called from JNI + private long[] hostReadBuff(long offset, long amount) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + HostMemoryBuffer buff = hostRead(offset, amount); + long[] ret = new long[3]; + if (buff != null) { + long id = buff.id; + if (cachedBuffers.put(id, buff) != null) { + throw new IllegalStateException("Already had a buffer cached for " + buff); + } + ret[0] = buff.address; + ret[1] = buff.length; + ret[2] = id; + } // else they are all 0 because java does that already + return ret; + } + + // Called from JNI + private long deviceRead(long offset, long amount, long dst, long stream) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + Cuda.Stream strm = Cuda.Stream.wrap(stream); + try (DeviceMemoryBuffer dstBuffer = new DeviceMemoryBuffer(dst, amount, cleaner)) { + return deviceRead(offset, dstBuffer, strm); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java new file mode 100644 index 00000000000..5d4dcb8e4e7 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java @@ -0,0 +1,44 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is here because we need some JNI methods to work with a DataSource, but + * we also want to cache callback methods at startup for performance reasons. If + * we put both in the same class we will get a deadlock because of how we load + * the JNI. 
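To make the DataSource contract above concrete: a subclass must provide size() and the two hostRead overloads, and may opt in to device reads. The sketch below is illustrative only and is not part of this patch; ByteArrayDataSource is a hypothetical name, and it assumes the existing HostMemoryBuffer.allocate(long) and setBytes(...) APIs from this library.

```java
// Illustrative only: a minimal DataSource over a plain byte[] in host memory.
import ai.rapids.cudf.DataSource;
import ai.rapids.cudf.HostMemoryBuffer;

public class ByteArrayDataSource extends DataSource {
  private final byte[] data;

  public ByteArrayDataSource(byte[] data) {
    this.data = data;
  }

  @Override
  public long size() {
    return data.length;
  }

  @Override
  public HostMemoryBuffer hostRead(long offset, long amount) {
    // Clamp to the end of the source; 'amount' is only a maximum.
    long len = Math.min(amount, data.length - offset);
    HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len);
    buffer.setBytes(0, data, offset, len);
    // The base class routes this buffer back through onHostBufferDone,
    // which closes it by default.
    return buffer;
  }

  @Override
  public long hostRead(long offset, HostMemoryBuffer dest) {
    // The target amount to read is dest's length; return the bytes written.
    long len = Math.min(dest.getLength(), data.length - offset);
    dest.setBytes(0, data, offset, len);
    return len;
  }
}
```

Because supportsDeviceRead() defaults to false, a subclass like this one never sees the deviceRead path; the JNI layer will fall back to the host read APIs.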
We have a static block that blocks loading the class until the JNI + * library is loaded and the JNI library cannot load until the class is loaded + * and cached. This breaks the loop. + */ +class DataSourceHelper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + static long createWrapperDataSource(DataSource ds) { + return createWrapperDataSource(ds, ds.size(), ds.supportsDeviceRead(), + ds.getDeviceReadCutoff()); + } + + private static native long createWrapperDataSource(DataSource ds, long size, + boolean deviceReadSupport, + long deviceReadCutoff); + + static native void destroyWrapperDataSource(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index c4d9bdb8f91..9eab607ed0b 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,10 @@ public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } + DeviceMemoryBuffer(long address, long lengthInBytes, MemoryBufferCleaner cleaner) { + super(address, lengthInBytes, cleaner); + } + DeviceMemoryBuffer(long address, long lengthInBytes, long rmmBufferAddress) { super(address, lengthInBytes, new RmmDeviceBufferCleaner(rmmBufferAddress)); } diff --git a/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java new file mode 100644 index 00000000000..6986b6a7fec --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is a DataSource that can take multiple HostMemoryBuffers. They + * are treated as if they are all part of a single file connected end to end. + */ +public class MultiBufferDataSource extends DataSource { + private final long sizeInBytes; + private final HostMemoryBuffer[] hostBuffers; + private final long[] startOffsets; + private final HostMemoryAllocator allocator; + + // Metrics + private long hostReads = 0; + private long hostReadBytes = 0; + private long devReads = 0; + private long devReadBytes = 0; + + /** + * Create a new data source backed by multiple buffers. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryBuffer ... buffers) { + this(DefaultHostMemoryAllocator.get(), buffers); + } + + /** + * Create a new data source backed by multiple buffers. + * @param allocator the allocator to use for host buffers, if needed. 
+ * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryAllocator allocator, HostMemoryBuffer ... buffers) { + int numBuffers = buffers.length; + hostBuffers = new HostMemoryBuffer[numBuffers]; + startOffsets = new long[numBuffers]; + + long currentOffset = 0; + for (int i = 0; i < numBuffers; i++) { + HostMemoryBuffer hmb = buffers[i]; + hmb.incRefCount(); + hostBuffers[i] = hmb; + startOffsets[i] = currentOffset; + currentOffset += hmb.getLength(); + } + sizeInBytes = currentOffset; + this.allocator = allocator; + } + + @Override + public long size() { + return sizeInBytes; + } + + private int getStartBufferIndexForOffset(long offset) { + assert (offset >= 0); + + // It is super common to read from the start or end of a file (the header or footer) + // so special case them + if (offset == 0) { + return 0; + } + int startIndex = 0; + int endIndex = startOffsets.length - 1; + if (offset >= startOffsets[endIndex]) { + return endIndex; + } + while (startIndex != endIndex) { + int midIndex = (int)(((long)startIndex + endIndex) / 2); + long midStartOffset = startOffsets[midIndex]; + if (offset >= midStartOffset) { + // It is either in mid or after mid. + if (midIndex == endIndex || offset <= startOffsets[midIndex + 1]) { + // We found it in mid + return midIndex; + } else { + // It is after mid + startIndex = midIndex + 1; + } + } else { + // It is before mid + endIndex = midIndex - 1; + } + } + return startIndex; + } + + + interface DoCopy { + void copyFromHostBuffer(T dest, long destOffset, HostMemoryBuffer src, + long srcOffset, long srcAmount); + } + + private long read(long offset, T dest, DoCopy doCopy) { + assert (offset >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, dest.getLength()); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + long remainingAmount = realAmount; + long currentOffset = realOffset; + long outputOffset = 0; + + while (remainingAmount > 0) { + doCopy.copyFromHostBuffer(dest, outputOffset, buffer, + bufferOffset, bufferAmount); + remainingAmount -= bufferAmount; + outputOffset += bufferAmount; + currentOffset += bufferAmount; + index++; + if (index < hostBuffers.length) { + buffer = hostBuffers[index]; + bufferOffset = currentOffset - startOffsets[index]; + bufferAmount = Math.min(buffer.length - bufferOffset, remainingAmount); + } + } + + return realAmount; + } + + @Override + public HostMemoryBuffer hostRead(long offset, long amount) { + assert (offset >= 0); + assert (amount >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, amount); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + if (bufferAmount == realAmount) { + hostReads += 1; + hostReadBytes += realAmount; + // It all fits in a single buffer, so do a zero copy operation + return buffer.slice(bufferOffset, bufferAmount); + } else { + // We will have to allocate a new buffer and copy data into it. 
+ boolean success = false; + HostMemoryBuffer ret = allocator.allocate(realAmount, true); + try { + long amountRead = read(offset, ret, HostMemoryBuffer::copyFromHostBuffer); + assert(amountRead == realAmount); + hostReads += 1; + hostReadBytes += amountRead; + success = true; + return ret; + } finally { + if (!success) { + ret.close(); + } + } + } + } + + @Override + public long hostRead(long offset, HostMemoryBuffer dest) { + long ret = read(offset, dest, HostMemoryBuffer::copyFromHostBuffer); + hostReads += 1; + hostReadBytes += ret; + return ret; + } + + @Override + public boolean supportsDeviceRead() { + return true; + } + + @Override + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) { + long ret = read(offset, dest, (destParam, destOffset, src, srcOffset, srcAmount) -> + destParam.copyFromHostBufferAsync(destOffset, src, srcOffset, srcAmount, stream)); + devReads += 1; + devReadBytes += ret; + return ret; + } + + + @Override + public void close() { + try { + super.close(); + } finally { + for (HostMemoryBuffer hmb: hostBuffers) { + if (hmb != null) { + hmb.close(); + } + } + } + } + + public long getHostReads() { + return hostReads; + } + + public long getHostReadBytes() { + return hostReadBytes; + } + + public long getDevReads() { + return devReads; + } + + public long getDevReadBytes() { + return devReadBytes; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index c34336ac73f..17d59b757c3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } @@ -71,18 +71,45 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } + /** + * Construct a reader instance from a DataSource + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param opts The options for Parquet reading. 
+ * @param ds the data source to read from + */ + public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, DataSource ds) { + dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + if (dataSourceHandle == 0) { + throw new IllegalStateException("Cannot create native datasource object"); + } + + boolean passed = false; + try { + handle = createWithDataSource(chunkSizeByteLimit, opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle); + passed = true; + } finally { + if (!passed) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } + } + } + /** * Check if the given file has anything left to read. * * @return A boolean value indicating if there is more data to read from file. */ public boolean hasNext() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -104,7 +131,7 @@ public boolean hasNext() { * @return A table of new rows reading from the given file. */ public Table readChunk() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -118,6 +145,10 @@ public void close() { close(handle); handle = 0; } + if (dataSourceHandle != 0) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } } @@ -131,6 +162,7 @@ public void close() { */ private long handle; + private long dataSourceHandle = 0; /** * Create a native chunked Parquet reader object on heap and return its memory address. @@ -147,6 +179,9 @@ public void close() { private static native long create(long chunkSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit); + private static native long createWithDataSource(long chunkedSizeByteLimit, + String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); + private static native boolean hasNext(long handle); private static native long[] readChunk(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 51a33ebb72f..3bd1e3f25a7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -235,6 +235,14 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readCSVFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String[] filterColumnNames, + int headerRow, byte delim, int quoteStyle, byte quote, + byte comment, String[] nullValues, + String[] trueValues, String[] falseValues, + long dataSourceHandle) throws CudfException; + /** * read JSON data and return a pointer to a TableWithMeta object. 
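Putting this constructor together with MultiBufferDataSource, caller-side usage might look like the sketch below. This is illustrative only: buf1 and buf2 are assumed to be HostMemoryBuffers that together hold the bytes of one valid Parquet file, and ParquetOptions.DEFAULT is the stock options object.

```java
// Illustrative only: read a Parquet file split across two host buffers in chunks.
try (MultiBufferDataSource ds = new MultiBufferDataSource(buf1, buf2);
     ParquetChunkedReader reader =
         new ParquetChunkedReader(1024 * 1024, ParquetOptions.DEFAULT, ds)) {
  while (reader.hasNext()) {
    // Each chunk is a normal Table and must be closed by the caller.
    try (Table chunk = reader.readChunk()) {
      System.out.println("read " + chunk.getRowCount() + " rows");
    }
  }
}
```

Note that the reader owns the native wrapper handle it created for ds, so closing the reader (here via try-with-resources) releases the wrapper before the MultiBufferDataSource itself is closed.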
*/ @@ -244,6 +252,12 @@ private static native long readJSON(String[] columnNames, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; + private static native long readJSONFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + boolean dayFirst, boolean lines, + boolean recoverWithNulls, + long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; @@ -260,6 +274,10 @@ private static native long readAndInferJSON(long address, long length, private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; + private static native long[] readParquetFromDataSource(String[] filterColumnNames, + boolean[] binaryToString, int timeUnit, + long dataSourceHandle) throws CudfException; + /** * Read in Avro formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -271,6 +289,9 @@ private static native long[] readParquet(String[] filterColumnNames, boolean[] b private static native long[] readAvro(String[] filterColumnNames, String filePath, long address, long length) throws CudfException; + private static native long[] readAvroFromDataSource(String[] filterColumnNames, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -372,6 +393,11 @@ private static native long[] readORC(String[] filterColumnNames, boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns) throws CudfException; + private static native long[] readORCFromDataSource(String[] filterColumnNames, + boolean usingNumPyTypes, int timeUnit, + String[] decimal128Columns, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns @@ -881,6 +907,27 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + public static Table readCSV(Schema schema, CSVOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readCSVFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), + schema.getTypeScales(), + opts.getIncludeColumnNames(), + opts.getHeaderRow(), + opts.getDelim(), + opts.getQuoteStyle().nativeId, + opts.getQuote(), + opts.getComment(), + opts.getNullValues(), + opts.getTrueValues(), + opts.getFalseValues(), + dsHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + private static native void writeCSVToFile(long table, String[] columnNames, boolean includeHeader, @@ -1128,6 +1175,24 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b } } + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull(), dsHandle))) { + return gatherJSONColumns(schema, twm); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. @@ -1214,6 +1279,17 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + public static Table readParquet(ParquetOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readParquetFromDataSource(opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read an Avro file using the default AvroOptions. * @param path the local file to read. @@ -1297,6 +1373,16 @@ public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len)); } + public static Table readAvro(AvroOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readAvroFromDataSource(opts.getIncludeColumnNames(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. @@ -1388,6 +1474,17 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer, opts.getDecimal128Columns())); } + public static Table readORC(ORCOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readORCFromDataSource(opts.getIncludeColumnNames(), + opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), + opts.getDecimal128Columns(), dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + private static class ParquetTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -2262,7 +2359,7 @@ public Table dropDuplicates(int[] keyColumns, DuplicateKeepOption keep, boolean /** * Count how many rows in the table are distinct from one another. - * @param nullEqual if nulls should be considered equal to each other or not. + * @param nullsEqual if nulls should be considered equal to each other or not. 
 */
  public int distinctCount(NullEquality nullsEqual) {
    return distinctCount(nativeHandle, nullsEqual.nullsEqual);
  }
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 0dcfee2cffe..01161a03dd4 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -135,6 +135,7 @@ add_library(
   src/ColumnViewJni.cu
   src/CompiledExpression.cpp
   src/ContiguousTableJni.cpp
+  src/DataSourceHelperJni.cpp
   src/HashJoinJni.cpp
   src/HostMemoryBufferNativeUtilsJni.cpp
   src/NvcompJni.cpp
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index 8d0a8bdbfe7..0044385f267 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -85,6 +85,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource(
+    JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names,
+    jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) {
+  JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
+  JNI_NULL_CHECK(env, ds_handle, "Null DataSource", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+
+    // TODO: This variable is currently unused, but we do not yet know whether it will be
+    // needed. Keep it until we decide to either use it or remove it.
+    cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
+    (void)n_col_binary_read;
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    auto opts_builder = cudf::io::parquet_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+    auto const read_opts = opts_builder.convert_strings_to_categories(false)
+                               .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                               .build();
+
+    return reinterpret_cast<jlong>(new cudf::io::chunked_parquet_reader(
+        static_cast<std::size_t>(chunk_read_limit), read_opts));
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass,
                                                                             jlong handle) {
   JNI_NULL_CHECK(env, handle, "handle is null", false);
diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp
index 0f143086451..d0a25d449a6 100644
--- a/java/src/main/native/src/CudfJni.cpp
+++ b/java/src/main/native/src/CudfJni.cpp
@@ -175,6 +175,14 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) {
     return JNI_ERR;
   }
 
+  if (!cudf::jni::cache_data_source_jni(env)) {
+    if (!env->ExceptionCheck()) {
+      env->ThrowNew(env->FindClass("java/lang/RuntimeException"),
+                    "Unable to locate data source helper methods needed by JNI");
+    }
+    return JNI_ERR;
+  }
+
   return cudf::jni::MINIMUM_JNI_VERSION;
 }
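With the ParquetChunkedReader constructor above and this native entry point in place, a chunked read can be driven entirely from a DataSource. A minimal usage sketch, not part of this diff: it assumes the bytes of a Parquet file are already in memory, reuses the host-buffer-backed MultiBufferDataSource exercised by the tests further below, and picks the 240000-byte chunk limit purely for illustration.

  // Illustrative sketch: stream a Parquet file held in host memory through the
  // chunked reader, at most ~240000 bytes of decoded output per chunk.
  static long countRowsChunked(byte[] parquetBytes) {
    long totalRows = 0;
    try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(parquetBytes.length)) {
      buffer.setBytes(0, parquetBytes, 0, parquetBytes.length);
      try (MultiBufferDataSource source = new MultiBufferDataSource(buffer);
           ParquetChunkedReader reader =
               new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) {
        while (reader.hasNext()) {
          try (Table chunk = reader.readChunk()) {
            totalRows += chunk.getRowCount(); // consume the chunk before it is closed
          }
        }
      }
    }
    return totalRows;
  }

Closing the reader first releases the native chunked reader and then the wrapper data source handle, matching the close() logic shown earlier.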
diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp
new file mode 100644
index 00000000000..8d0e4d36413
--- /dev/null
+++ b/java/src/main/native/src/DataSourceHelperJni.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/io/datasource.hpp>
+
+#include "cudf_jni_apis.hpp"
+#include "jni_utils.hpp"
+
+namespace {
+
+#define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource"
+
+jclass DataSource_jclass;
+jmethodID hostRead_method;
+jmethodID hostReadBuff_method;
+jmethodID onHostBufferDone_method;
+jmethodID deviceRead_method;
+
+} // anonymous namespace
+
+namespace cudf {
+namespace jni {
+bool cache_data_source_jni(JNIEnv *env) {
+  jclass cls = env->FindClass(DATA_SOURCE_CLASS);
+  if (cls == nullptr) {
+    return false;
+  }
+
+  hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J");
+  if (hostRead_method == nullptr) {
+    return false;
+  }
+
+  hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J");
+  if (hostReadBuff_method == nullptr) {
+    return false;
+  }
+
+  onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V");
+  if (onHostBufferDone_method == nullptr) {
+    return false;
+  }
+
+  deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J");
+  if (deviceRead_method == nullptr) {
+    return false;
+  }
+
+  // Convert local reference to global so it cannot be garbage collected.
+  DataSource_jclass = static_cast<jclass>(env->NewGlobalRef(cls));
+  if (DataSource_jclass == nullptr) {
+    return false;
+  }
+  return true;
+}
+
+void release_data_source_jni(JNIEnv *env) {
+  DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass);
+}
+
+class host_buffer_done_callback {
+public:
+  explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {}
+
+  host_buffer_done_callback(host_buffer_done_callback const &other) = delete;
+  host_buffer_done_callback(host_buffer_done_callback &&other)
+      : jvm(other.jvm), ds(other.ds), id(other.id) {
+    other.jvm = nullptr;
+    other.ds = nullptr;
+    other.id = -1;
+  }
+
+  host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete;
+  host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete;
+
+  ~host_buffer_done_callback() {
+    // Because we are in a destructor we cannot throw an exception; for now we keep any
+    // pending Java exceptions around so they are thrown when this thread returns to the
+    // JVM. That may be confusing, but they are not lost.
+    if (jvm != nullptr) {
+      // We cannot throw an exception in the destructor, so this is really best effort
+      JNIEnv *env = nullptr;
+      if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+        env->CallVoidMethod(this->ds, onHostBufferDone_method, id);
+      }
+    }
+  }
+
+private:
+  JavaVM *jvm;
+  jobject ds;
+  long id;
+};
+
+class jni_datasource : public cudf::io::datasource {
+public:
+  explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported,
+                          size_t device_read_cutoff)
+      : ds_size(ds_size), device_read_supported(device_read_supported),
+        device_read_cutoff(device_read_cutoff) {
+    if (env->GetJavaVM(&jvm) < 0) {
+      throw std::runtime_error("GetJavaVM failed");
+    }
+    this->ds = add_global_ref(env, ds);
+  }
+
+  virtual ~jni_datasource() {
+    JNIEnv *env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+      ds = del_global_ref(env, ds);
+    }
+    ds = nullptr;
+  }
+
+  std::unique_ptr<cudf::io::datasource::buffer> host_read(size_t offset, size_t size) override {
+    JNIEnv *env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+      throw cudf::jni::jni_exception("Could not load JNIEnv");
+    }
+
+    jlongArray jbuffer_info =
+        static_cast<jlongArray>(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size));
+    if (env->ExceptionOccurred()) {
+      throw cudf::jni::jni_exception("Java exception in hostReadBuff");
+    }
+
+    cudf::jni::native_jlongArray buffer_info(env, jbuffer_info);
+    auto ptr = reinterpret_cast<uint8_t *>(buffer_info[0]);
+    size_t length = buffer_info[1];
+    long id = buffer_info[2];
+
+    cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id);
+    return std::make_unique<owning_buffer<cudf::jni::host_buffer_done_callback>>(std::move(cb),
+                                                                                 ptr, length);
+  }
+
+  size_t host_read(size_t offset, size_t size, uint8_t *dst) override {
+    JNIEnv *env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+      throw cudf::jni::jni_exception("Could not load JNIEnv");
+    }
+
+    jlong amount_read =
+        env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast<jlong>(dst));
+    if (env->ExceptionOccurred()) {
+      throw cudf::jni::jni_exception("Java exception in hostRead");
+    }
+    return amount_read;
+  }
+
+  size_t size() const override { return ds_size; }
+
+  bool supports_device_read() const override { return device_read_supported; }
+
+  bool is_device_read_preferred(size_t size) const override {
+    return device_read_supported && size >= device_read_cutoff;
+  }
+
+  size_t device_read(size_t offset, size_t size, uint8_t *dst,
+                     rmm::cuda_stream_view stream) override {
+    JNIEnv *env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+      throw cudf::jni::jni_exception("Could not load JNIEnv");
+    }
+
+    jlong amount_read =
+        env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast<jlong>(dst),
+                            reinterpret_cast<jlong>(stream.value()));
+    if (env->ExceptionOccurred()) {
+      throw cudf::jni::jni_exception("Java exception in deviceRead");
+    }
+    return amount_read;
+  }
+
+  std::future<size_t> device_read_async(size_t offset, size_t size, uint8_t *dst,
+                                        rmm::cuda_stream_view stream) override {
+    auto amount_read = device_read(offset, size, dst, stream);
+    // This is a bit ugly, but we have neither a good way nor a need to return
+    // a truly asynchronous future for the read.
+    std::promise<size_t> ret;
+    ret.set_value(amount_read);
+    return ret.get_future();
+  }
+
+private:
+  size_t ds_size;
+  bool device_read_supported;
+  size_t device_read_cutoff;
+  JavaVM *jvm;
+  jobject ds;
+};
+} // namespace jni
+} // namespace cudf
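The method IDs cached in cache_data_source_jni() define the contract jni_datasource expects from the Java class: hostReadBuff returns a long[3] of {host address, length, callback id}, and the returned memory must stay valid until onHostBufferDone(id) fires from host_buffer_done_callback's destructor, while hostRead and deviceRead copy into a caller-provided address and return the byte count. A hedged Java-side summary of that contract follows; the ai.rapids.cudf.DataSource class itself is not part of this excerpt, so the exact modifiers, and whether these are abstract methods or JNI bridge methods, may differ.

  // Hypothetical interface mirroring the JNI signatures cached above.
  interface DataSourceNativeContract {
    // "(JJJ)J": copy up to `size` bytes starting at `offset` into the native
    // address `dstAddress`; return the number of bytes actually read.
    long hostRead(long offset, long size, long dstAddress);

    // "(JJ)[J": return {hostAddress, length, callbackId}; the memory must stay
    // valid until onHostBufferDone(callbackId) is invoked.
    long[] hostReadBuff(long offset, long size);

    // "(J)V": called when the native owning_buffer wrapping a hostReadBuff
    // result is destroyed.
    void onHostBufferDone(long id);

    // "(JJJJ)J": like hostRead, but `dstAddress` is device memory and the copy
    // must be ordered on the given CUDA `stream`.
    long deviceRead(long offset, long size, long dstAddress, long stream);
  }

Device reads are only attempted when supports_device_read() is true, and is_device_read_preferred() routes reads at or above device_read_cutoff to deviceRead while smaller reads stay on the host path.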
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource(
+    JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported,
+    jlong device_read_cutoff) {
+  JNI_NULL_CHECK(env, ds, "Null data source", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto source =
+        new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff);
+    return reinterpret_cast<jlong>(source);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env,
+                                                                                     jclass,
+                                                                                     jlong handle) {
+  try {
+    cudf::jni::auto_set_device(env);
+    if (handle != 0) {
+      auto source = reinterpret_cast<cudf::jni::jni_datasource *>(handle);
+      delete source;
+    }
+  }
+  CATCH_STD(env, );
+}
+
+} // extern "C"
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index b208ef8f381..fad19bdf895 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1135,6 +1135,67 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource(
+    JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
+    jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote,
+    jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values,
+    jlong ds_handle) {
+  JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL);
+  JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::jni::native_jstringArray n_col_names(env, col_names);
+    cudf::jni::native_jintArray n_types(env, j_types);
+    cudf::jni::native_jintArray n_scales(env, j_scales);
+    if (n_types.is_null() != n_scales.is_null()) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null",
+                    NULL);
+    }
+    std::vector<cudf::data_type> data_types;
+    if (!n_types.is_null()) {
+      if (n_types.size() != n_scales.size()) {
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size",
+                      NULL);
+      }
+      data_types.reserve(n_types.size());
+      std::transform(n_types.begin(), n_types.end(), n_scales.begin(),
+                     std::back_inserter(data_types), [](auto type, auto scale) {
+                       return cudf::data_type{static_cast<cudf::type_id>(type), scale};
+                     });
+    }
+
+    cudf::jni::native_jstringArray n_null_values(env, null_values);
+    cudf::jni::native_jstringArray n_true_values(env, true_values);
+    cudf::jni::native_jstringArray n_false_values(env, false_values);
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    auto const quote_style = static_cast<cudf::io::quote_style>(j_quote_style);
+
+    cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source)
+                                            .delimiter(delim)
+                                            .header(header_row)
+                                            .names(n_col_names.as_cpp_vector())
+                                            .dtypes(data_types)
+                                            .use_cols_names(n_filter_col_names.as_cpp_vector())
+                                            .true_values(n_true_values.as_cpp_vector())
+                                            .false_values(n_false_values.as_cpp_vector())
+                                            .na_values(n_null_values.as_cpp_vector())
+                                            .keep_default_na(false)
+                                            .na_filter(n_null_values.size() > 0)
+                                            .quoting(quote_style)
+                                            .quotechar(quote)
+                                            .comment(comment)
+                                            .build();
+
+    return convert_table_for_return(env, cudf::io::read_csv(opts).tbl);
+  }
+  CATCH_STD(env, NULL);
+}
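On the Java side this entry point backs the new Table.readCSV(Schema, CSVOptions, DataSource) overload. A small sketch, illustrative only: it assumes the builder's default comma delimiter and reuses the HostMemoryBuffer-backed MultiBufferDataSource from the tests below.

  // Illustrative sketch: read delimited data from a DataSource with an explicit schema.
  Schema schema = Schema.builder()
      .column(DType.INT32, "A")
      .column(DType.FLOAT64, "B")
      .build();
  CSVOptions opts = CSVOptions.builder()
      .includeColumn("A")
      .includeColumn("B")
      .hasHeader()
      .build();
  byte[] csv = "A,B\n0,110.0\n1,111.0\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
  try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(csv.length)) {
    buffer.setBytes(0, csv, 0, csv.length);
    try (MultiBufferDataSource source = new MultiBufferDataSource(buffer);
         Table table = Table.readCSV(schema, opts, source)) {
      assert table.getRowCount() == 2;
    }
  }

Note how the Schema's type ids and scales arrive here as the parallel j_types/j_scales arrays and are zipped back into cudf::data_type values before being handed to the CSV reader builder.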
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length,
@@ -1407,6 +1468,72 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
   CATCH_STD(env, nullptr);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
+    JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
+    jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) {
+
+  JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::jni::native_jstringArray n_col_names(env, col_names);
+    cudf::jni::native_jintArray n_types(env, j_types);
+    cudf::jni::native_jintArray n_scales(env, j_scales);
+    if (n_types.is_null() != n_scales.is_null()) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null",
+                    0);
+    }
+    std::vector<cudf::data_type> data_types;
+    if (!n_types.is_null()) {
+      if (n_types.size() != n_scales.size()) {
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size",
+                      0);
+      }
+      data_types.reserve(n_types.size());
+      std::transform(n_types.begin(), n_types.end(), n_scales.begin(),
+                     std::back_inserter(data_types), [](auto const &type, auto const &scale) {
+                       return cudf::data_type{static_cast<cudf::type_id>(type), scale};
+                     });
+    }
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    cudf::io::json_recovery_mode_t recovery_mode =
+        recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
+                            cudf::io::json_recovery_mode_t::FAIL;
+    cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
+                                                     .dayfirst(static_cast<bool>(day_first))
+                                                     .lines(static_cast<bool>(lines))
+                                                     .recovery_mode(recovery_mode);
+
+    if (!n_col_names.is_null() && data_types.size() > 0) {
+      if (n_col_names.size() != n_types.size()) {
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                      "types and column names must match size", 0);
+      }
+
+      std::map<std::string, cudf::data_type> map;
+
+      auto col_names_vec = n_col_names.as_cpp_vector();
+      std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(),
+                     std::inserter(map, map.end()),
+                     [](std::string a, cudf::data_type b) { return std::make_pair(a, b); });
+      opts.dtypes(map);
+    } else if (data_types.size() > 0) {
+      opts.dtypes(data_types);
+    } else {
+      // should infer the types
+    }
+
+    auto result =
+        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
@@ -1489,6 +1616,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource(
+    JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit,
+    jlong ds_handle) {
+
+  JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
+  JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+    cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    auto builder = cudf::io::parquet_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
+    cudf::io::parquet_reader_options opts =
+        builder.convert_strings_to_categories(false)
+            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+            .build();
+    return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
     JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read,
     jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) {
@@ -1535,10 +1692,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource(
+    JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) {
+
+  JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source)
+                                             .columns(n_filter_col_names.as_cpp_vector())
+                                             .build();
+    return convert_table_for_return(env, cudf::io::read_avro(opts).tbl);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass,
                                                                 jobjectArray filter_col_names,
                                                                 jstring inputfilepath, jlong buffer,
-                                                                jlong buffer_length, jint unit) {
+                                                                jlong buffer_length) {
 
   const bool read_buffer = (buffer != 0);
   if (!read_buffer) {
@@ -1715,6 +1893,38 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc
   CATCH_STD(env, )
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource(
+    JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit,
+    jobjectArray dec128_col_names, jlong ds_handle) {
+
+  JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+
+    cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    auto builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
+    cudf::io::orc_reader_options opts =
+        builder.use_index(false)
+            .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
+            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+            .decimal128_columns(n_dec128_col_names.as_cpp_vector())
+            .build();
+    return convert_table_for_return(env, cudf::io::read_orc(opts).tbl);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
     JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer,
     jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) {
diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp
index 867df80b722..bd82bbd2899 100644
--- a/java/src/main/native/src/cudf_jni_apis.hpp
+++ b/java/src/main/native/src/cudf_jni_apis.hpp
@@ -134,5 +134,13 @@ void
auto_set_device(JNIEnv *env); */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value); +// +// DataSource APIs +// + +bool cache_data_source_jni(JNIEnv *env); + +void release_data_source_jni(JNIEnv *env); + } // namespace jni } // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index faa73ac4322..b0dd4122b0e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -327,6 +327,25 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFromDataSource() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_SIMPLE_JSON_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFileWithInvalidLines() { Schema schema = Schema.builder() @@ -560,6 +579,126 @@ void testReadCSVBuffer() { } } + byte[][] sliceBytes(byte[] data, int slices) { + slices = Math.min(data.length, slices); + // We are not going to worry about making it super even here. + // The last one gets the extras. + int bytesPerSlice = data.length / slices; + byte[][] ret = new byte[slices][]; + int startingAt = 0; + for (int i = 0; i < (slices - 1); i++) { + ret[i] = new byte[bytesPerSlice]; + System.arraycopy(data, startingAt, ret[i], 0, bytesPerSlice); + startingAt += bytesPerSlice; + } + // Now for the last one + ret[slices - 1] = new byte[data.length - startingAt]; + System.arraycopy(data, startingAt, ret[slices - 1], 0, data.length - startingAt); + return ret; + } + + @Test + void testReadCSVBufferMultiBuffer() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + byte[][] data = sliceBytes(CSV_DATA_BUFFER, 10); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + + public static byte[] arrayFrom(File f) throws IOException { + long len = f.length(); + if (len > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry cannot read " + f + + " into an array it does not fit"); + } + int remaining = (int)len; + byte[] ret = new byte[remaining]; + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + int at = 0; + while (remaining > 0) { + int amount = fin.read(ret, at, remaining); + at += amount; + remaining -= amount; + } + } + return ret; + } + + public static MultiBufferDataSource sourceFrom(File f) throws IOException { + long len = f.length(); + byte[] tmp = new byte[(int)Math.min(32 * 1024, len)]; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + long at = 0; + while (at < len) { + int amount = fin.read(tmp); + buffer.setBytes(at, tmp, 0, amount); + at += amount; + } + } + return new MultiBufferDataSource(buffer); + } + } + + public 
static MultiBufferDataSource sourceFrom(byte[] data) { + long len = data.length; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + buffer.setBytes(0, data, 0, len); + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[][] data) { + HostMemoryBuffer[] buffers = new HostMemoryBuffer[data.length]; + try { + for (int i = 0; i < data.length; i++) { + byte[] subData = data[i]; + buffers[i] = HostMemoryBuffer.allocate(subData.length); + buffers[i].setBytes(0, subData, 0, subData.length); + } + return new MultiBufferDataSource(buffers); + } finally { + for (HostMemoryBuffer buffer: buffers) { + if (buffer != null) { + buffer.close(); + } + } + } + } + + @Test + void testReadCSVDataSource() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(TableTest.CSV_DATA_BUFFER); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVWithOffset() { CSVOptions opts = CSVOptions.builder() @@ -864,6 +1003,37 @@ void testReadParquet() { } } + @Test + void testReadParquetFromDataSource() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + + @Test + void testReadParquetMultiBuffer() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + byte [][] data = sliceBytes(arrayFrom(TEST_PARQUET_FILE), 10); + try (MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() @@ -1018,6 +1188,23 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetFromDataSource() throws IOException { + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); + ParquetChunkedReader reader = new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + @Test void testReadAvro() { AvroOptions opts = AvroOptions.builder() @@ -1037,6 +1224,26 @@ void testReadAvro() { } } + @Test + void testReadAvroFromDataSource() throws IOException { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + 
.column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ALL_TYPES_PLAIN_AVRO_FILE); + Table table = Table.readAvro(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadAvroBuffer() throws IOException{ AvroOptions opts = AvroOptions.builder() @@ -1094,6 +1301,24 @@ void testReadORC() { } } + @Test + void testReadORCFromDataSource() throws IOException { + ORCOptions opts = ORCOptions.builder() + .includeColumn("string1") + .includeColumn("float1") + .includeColumn("int1") + .build(); + try (Table expected = new Table.TestBuilder() + .column("hi","bye") + .column(1.0f,2.0f) + .column(65536,65536) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ORC_FILE); + Table table = Table.readORC(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORCBuffer() throws IOException { ORCOptions opts = ORCOptions.builder()