From 0cac2a9d68341a38721be16132ead14cf4a0d70b Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 22 Jul 2024 14:18:21 -0700 Subject: [PATCH 01/15] Remove size constraints on source files in batched JSON reading (#16162) Addresses https://github.com/rapidsai/cudf/issues/16138 The batched multi-source JSON reader fails when the size of any of the input source buffers exceeds `INT_MAX` bytes. The goal of this PR is to remove this constraint by modifying the batching behavior of the reader. Instead of constructing batches that include entire source files, the batches are now constructed at the granularity of byte ranges of size at most `INT_MAX` bytes, Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16162 --- cpp/include/cudf/io/json.hpp | 4 +- cpp/src/io/json/read_json.cu | 139 +++++++++--------- cpp/src/io/json/read_json.hpp | 18 ++- cpp/tests/CMakeLists.txt | 14 +- .../json_chunked_reader.cu} | 81 ++-------- .../json_quote_normalization_test.cpp | 0 cpp/tests/io/{ => json}/json_test.cpp | 0 cpp/tests/io/{ => json}/json_tree.cpp | 0 .../io/{ => json}/json_type_cast_test.cu | 0 cpp/tests/io/json/json_utils.cuh | 105 +++++++++++++ .../json_whitespace_normalization_test.cu | 0 cpp/tests/io/{ => json}/json_writer.cpp | 0 cpp/tests/io/{ => json}/nested_json_test.cpp | 0 .../{json_tests.cpp => json_tests.cu} | 45 +++++- 14 files changed, 242 insertions(+), 164 deletions(-) rename cpp/tests/io/{json_chunked_reader.cpp => json/json_chunked_reader.cu} (64%) rename cpp/tests/io/{ => json}/json_quote_normalization_test.cpp (100%) rename cpp/tests/io/{ => json}/json_test.cpp (100%) rename cpp/tests/io/{ => json}/json_tree.cpp (100%) rename cpp/tests/io/{ => json}/json_type_cast_test.cu (100%) create mode 100644 cpp/tests/io/json/json_utils.cuh rename cpp/tests/io/{ => json}/json_whitespace_normalization_test.cu (100%) rename cpp/tests/io/{ => json}/json_writer.cpp (100%) rename cpp/tests/io/{ => json}/nested_json_test.cpp (100%) rename cpp/tests/large_strings/{json_tests.cpp => json_tests.cu} (50%) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7af90766ad0..d47266fdd12 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -333,14 +333,14 @@ class json_reader_options { * * @param offset Number of bytes of offset */ - void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; } + void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; } /** * @brief Set number of bytes to read. 
* * @param size Number of bytes to read */ - void set_byte_range_size(size_type size) { _byte_range_size = size; } + void set_byte_range_size(size_t size) { _byte_range_size = size; } /** * @brief Set delimiter separating records in JSON lines diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 9cd39038348..0ba4dedfc34 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -148,20 +148,12 @@ device_span ingest_raw_input(device_span buffer, return buffer.first(uncomp_data.size()); } -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream) +size_t estimate_size_per_subchunk(size_t chunk_size) { - auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1); - rmm::device_uvector buffer(total_source_size, stream); - auto readbufspan = ingest_raw_input(buffer, - sources, - reader_opts.get_compression(), - reader_opts.get_byte_range_offset(), - reader_opts.get_byte_range_size(), - stream); - return find_first_delimiter(readbufspan, '\n', stream); + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); } /** @@ -183,7 +175,6 @@ datasource::owning_buffer> get_record_range_raw_input( rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; @@ -198,17 +189,8 @@ datasource::owning_buffer> get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - // Some magic numbers - constexpr int num_subchunks = 10; // per chunk_size - constexpr size_t min_subchunk_size = 10000; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : 3; - constexpr int estimated_compression_ratio = 4; - - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - - size_t const size_per_subchunk = - geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea @@ -308,67 +290,78 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - std::for_each(sources.begin(), sources.end(), [](auto const& source) { - CUDF_EXPECTS(source->size() < std::numeric_limits::max(), - "The size of each source file must be less than INT_MAX bytes"); - }); - - constexpr size_t batch_size_ub = std::numeric_limits::max(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); + /* + * The batched JSON reader enforces that the size of each batch is at most INT_MAX + * bytes (~2.14GB). 
Batches are defined to be byte range chunks - characterized by + * chunk offset and chunk size - that may span across multiple source files. + * Note that the batched reader does not work for compressed inputs or for regular + * JSON inputs. + */ + size_t const total_source_size = sources_size(sources, 0, 0); + size_t chunk_offset = reader_opts.get_byte_range_offset(); size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size; - - // Identify the position of starting source file from which to begin batching based on - // byte range offset. If the offset is larger than the sum of all source - // sizes, then start_source is total number of source files i.e. no file is read - size_t const start_source = [&]() { - size_t sum = 0; + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); + + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + size_t const batch_size_ub = + std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + + /* + * Identify the position (zero-indexed) of starting source file from which to begin + * batching based on byte range offset. If the offset is larger than the sum of all + * source sizes, then start_source is total number of source files i.e. no file is + * read + */ + + // Prefix sum of source file sizes + size_t pref_source_size = 0; + // Starting source file from which to being batching evaluated using byte range offset + size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { - if (sum + sources[src_idx]->size() > chunk_offset) return src_idx; - sum += sources[src_idx]->size(); + if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } + pref_source_size += sources[src_idx]->size(); } return sources.size(); }(); - - // Construct batches of source files, with starting position of batches indicated by - // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch - // is capped at INT_MAX bytes. - size_t cur_size = 0; - std::vector batch_positions; - std::vector batch_sizes; - batch_positions.push_back(0); - for (size_t i = start_source; i < sources.size(); i++) { - cur_size += sources[i]->size(); - if (cur_size >= batch_size_ub) { - batch_positions.push_back(i); - batch_sizes.push_back(cur_size - sources[i]->size()); - cur_size = sources[i]->size(); + /* + * Construct batches of byte ranges spanning source files, with the starting position of batches + * indicated by `batch_offsets`. `pref_bytes_size` gives the bytes position from which the current + * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading + * stops. 
+ */ + size_t pref_bytes_size = chunk_offset; + size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + pref_source_size += sources[i]->size(); + // If the current source file can subsume multiple batches, we split the file until the + // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) + while (pref_bytes_size < end_bytes_size && + pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { + auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + batch_offsets.push_back(batch_offsets.back() + next_batch_size); + pref_bytes_size += next_batch_size; } + i++; } - batch_positions.push_back(sources.size()); - batch_sizes.push_back(cur_size); - - // If there is a single batch, then we can directly return the table without the - // unnecessary concatenate - if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr); + /* + * If there is a single batch, then we can directly return the table without the + * unnecessary concatenate. The size of batch_offsets is 1 if all sources are empty, + * or if end_bytes_size is larger than total_source_size. + */ + if (batch_offsets.size() <= 2) return read_batch(sources, reader_opts, stream, mr); std::vector partial_tables; json_reader_options batched_reader_opts{reader_opts}; - // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. - for (size_t i = 0; i < batch_sizes.size(); i++) { - batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size)); - partial_tables.emplace_back(read_batch( - host_span>(sources.begin() + batch_positions[i], - batch_positions[i + 1] - batch_positions[i]), - batched_reader_opts, - stream, - rmm::mr::get_current_device_resource())); - if (chunk_size <= batch_sizes[i]) break; - chunk_size -= batch_sizes[i]; - batched_reader_opts.set_byte_range_offset(0); + for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + batched_reader_opts.set_byte_range_offset(batch_offsets[i]); + batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); + partial_tables.emplace_back( + read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource())); } auto expects_schema_equality = diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 0c30b4cad46..ff69f9b7627 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -29,6 +29,19 @@ namespace cudf::io::json::detail { +// Some magic numbers +constexpr int num_subchunks = 10; // per chunk_size +constexpr size_t min_subchunk_size = 10000; +constexpr int estimated_compression_ratio = 4; +constexpr int max_subchunks_prealloced = 3; + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + size_t range_offset, + size_t range_size, + rmm::cuda_stream_view stream); + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -38,9 +51,4 @@ size_type find_first_delimiter(device_span d_data, char const delimiter, rmm::cuda_stream_view stream); -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - 
rmm::cuda_stream_view stream); - } // namespace cudf::io::json::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8e2017ccb97..05e9759632f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,17 +313,17 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp + JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 PERCENT 30 ) -ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) -ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) -ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) +ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) +ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) -ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) +ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -572,7 +572,7 @@ ConfigureTest( LARGE_STRINGS_TEST large_strings/concatenate_tests.cpp large_strings/case_tests.cpp - large_strings/json_tests.cpp + large_strings/json_tests.cu large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json/json_chunked_reader.cu similarity index 64% rename from cpp/tests/io/json_chunked_reader.cpp rename to cpp/tests/io/json/json_chunked_reader.cu index 23d54f7263c..b9dee54752c 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json/json_chunked_reader.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "io/json/read_json.hpp" +#include "json_utils.cuh" #include #include @@ -37,65 +37,6 @@ cudf::test::TempDirTestEnvironment* const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); -// function to extract first delimiter in the string in each chunk, -// collate together and form byte_range for each chunk, -// parse separately. -std::vector skeleton_for_parellel_chunk_reader( - cudf::host_span> sources, - cudf::io::json_reader_options const& reader_opts, - int32_t chunk_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using namespace cudf::io::json::detail; - using cudf::size_type; - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; - constexpr size_type no_min_value = -1; - - // Get the first delimiter in each chunk. 
- std::vector first_delimiter_index(num_chunks); - auto reader_opts_chunk = reader_opts; - for (size_t i = 0; i < num_chunks; i++) { - auto const chunk_start = i * chunk_size; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_size); - first_delimiter_index[i] = - find_first_delimiter_in_chunk(sources, reader_opts_chunk, '\n', stream); - if (first_delimiter_index[i] != no_min_value) { first_delimiter_index[i] += chunk_start; } - } - - // Process and allocate record start, end for each worker. - using record_range = std::pair; - std::vector record_ranges; - record_ranges.reserve(num_chunks); - first_delimiter_index[0] = 0; - auto prev = first_delimiter_index[0]; - for (size_t i = 1; i < num_chunks; i++) { - if (first_delimiter_index[i] == no_min_value) continue; - record_ranges.emplace_back(prev, first_delimiter_index[i]); - prev = first_delimiter_index[i]; - } - record_ranges.emplace_back(prev, total_source_size); - - std::vector tables; - // Process each chunk in parallel. - for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or - static_cast(chunk_start) >= total_source_size) - continue; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); - tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); - } - // assume all records have same number of columns, and inferred same type. (or schema is passed) - // TODO a step before to merge all columns, types and infer final schema. - return tables; -} - TEST_F(JsonReaderTest, ByteRange_SingleSource) { std::string const json_string = R"( @@ -118,11 +59,11 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { @@ -213,11 +154,11 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp similarity index 100% rename from cpp/tests/io/json_quote_normalization_test.cpp rename to cpp/tests/io/json/json_quote_normalization_test.cpp diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json/json_test.cpp similarity index 100% rename from cpp/tests/io/json_test.cpp rename to cpp/tests/io/json/json_test.cpp diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp similarity index 100% rename from 
cpp/tests/io/json_tree.cpp rename to cpp/tests/io/json/json_tree.cpp diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu similarity index 100% rename from cpp/tests/io/json_type_cast_test.cu rename to cpp/tests/io/json/json_type_cast_test.cu diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh new file mode 100644 index 00000000000..9383797d91b --- /dev/null +++ b/cpp/tests/io/json/json_utils.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "io/json/read_json.hpp" + +#include +#include +#include +#include + +#include + +#include + +// Helper function to test correctness of JSON byte range reading. +// We split the input source files into a set of byte range chunks each of size +// `chunk_size` and return an array of partial tables constructed from each chunk +template +std::vector split_byte_range_reading( + cudf::host_span> sources, + cudf::io::json_reader_options const& reader_opts, + IndexType chunk_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto total_source_size = [&sources]() { + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + return sum + size; + }); + }(); + auto find_first_delimiter_in_chunk = + [total_source_size, &sources, &stream]( + cudf::io::json_reader_options const& reader_opts) -> IndexType { + rmm::device_uvector buffer(total_source_size, stream); + auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer, + sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size(), + stream); + // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the + // return type of that function is size_type. However, when the chunk_size is + // larger than INT_MAX, the position of the delimiter can also be larger than + // INT_MAX. We do not encounter this overflow error in the detail function + // since the batched JSON reader splits the byte_range_size into chunk_sizes + // smaller than INT_MAX bytes + auto const first_delimiter_position_it = + thrust::find(rmm::exec_policy(stream), readbufspan.begin(), readbufspan.end(), '\n'); + return first_delimiter_position_it != readbufspan.end() + ? thrust::distance(readbufspan.begin(), first_delimiter_position_it) + : -1; + }; + size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; + constexpr IndexType no_min_value = -1; + + // Get the first delimiter in each chunk. 
+ std::vector first_delimiter_index(num_chunks); + auto reader_opts_chunk = reader_opts; + for (size_t i = 0; i < num_chunks; i++) { + auto const chunk_start = i * chunk_size; + // We are updating reader_opt_chunks to store offset and size information for the current chunk + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_size); + first_delimiter_index[i] = find_first_delimiter_in_chunk(reader_opts_chunk); + } + + // Process and allocate record start, end for each worker. + using record_range = std::pair; + std::vector record_ranges; + record_ranges.reserve(num_chunks); + size_t prev = 0; + for (size_t i = 1; i < num_chunks; i++) { + // In the case where chunk_size is smaller than row size, the chunk needs to be skipped + if (first_delimiter_index[i] == no_min_value) continue; + size_t next = static_cast(first_delimiter_index[i]) + (i * chunk_size); + record_ranges.emplace_back(prev, next); + prev = next; + } + record_ranges.emplace_back(prev, total_source_size); + + std::vector tables; + for (auto const& [chunk_start, chunk_end] : record_ranges) { + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); + tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr)); + } + // assume all records have same number of columns, and inferred same type. (or schema is passed) + // TODO a step before to merge all columns, types and infer final schema. + return tables; +} diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu similarity index 100% rename from cpp/tests/io/json_whitespace_normalization_test.cu rename to cpp/tests/io/json/json_whitespace_normalization_test.cu diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp similarity index 100% rename from cpp/tests/io/json_writer.cpp rename to cpp/tests/io/json/json_writer.cpp diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp similarity index 100% rename from cpp/tests/io/nested_json_test.cpp rename to cpp/tests/io/json/nested_json_test.cpp diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cu similarity index 50% rename from cpp/tests/large_strings/json_tests.cpp rename to cpp/tests/large_strings/json_tests.cu index bf16d131ba7..49abf7b484d 100644 --- a/cpp/tests/large_strings/json_tests.cpp +++ b/cpp/tests/large_strings/json_tests.cu @@ -14,8 +14,13 @@ * limitations under the License. 
*/ +#include "../io/json/json_utils.cuh" #include "large_strings_fixture.hpp" +#include + +#include +#include #include #include @@ -28,31 +33,57 @@ TEST_F(JsonLargeReaderTest, MultiBatch) { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t expected_file_size = std::numeric_limits::max() / 2; + constexpr size_t batch_size_ub = std::numeric_limits::max(); + constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); json_string.reserve(json_string.size() * (1UL << log_repetitions)); - std::size_t numrows = 4; for (std::size_t i = 0; i < log_repetitions; i++) { json_string += json_string; - numrows <<= 1; } constexpr int num_sources = 2; - std::vector> hostbufs( - num_sources, cudf::host_span(json_string.data(), json_string.size())); + std::vector> hostbufs( + num_sources, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( cudf::io::source_info{ - cudf::host_span>(hostbufs.data(), hostbufs.size())}) + cudf::host_span>(hostbufs.data(), hostbufs.size())}) .lines(true) .compression(cudf::io::compression_type::NONE) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources); + + std::vector> datasources; + for (auto& hb : hostbufs) { + datasources.emplace_back(cudf::io::datasource::create(hb)); + } + // Test for different chunk sizes + std::vector chunk_sizes{ + batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + for (auto chunk_size : chunk_sizes) { + auto const tables = + split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + auto table_views = std::vector(tables.size()); + std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify that the data read via chunked reader matches the data read via nested JSON reader + // cannot use EQUAL due to concatenate removing null mask + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); + } } From c7b28ceeb46d2b921e30f081a9ed97745c91ff9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:28:13 -0500 Subject: [PATCH 02/15] Add `drop_nulls` in `cudf-polars` (#16290) Closes https://github.com/rapidsai/cudf/issues/16219 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16290 --- python/cudf_polars/cudf_polars/dsl/expr.py | 30 +++++++++- python/cudf_polars/tests/test_drop_nulls.py | 65 +++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/test_drop_nulls.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a034d55120a..8322d6bd6fb 100644 --- 
a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -882,7 +882,14 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("mask_nans", "round", "setsorted", "unique"): + if self.name not in ( + "mask_nans", + "round", + "setsorted", + "unique", + "dropnull", + "fill_null", + ): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -968,6 +975,27 @@ def do_evaluate( order=order, null_order=null_order, ) + elif self.name == "dropnull": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py new file mode 100644 index 00000000000..5dfe9f66a97 --- /dev/null +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +@pytest.fixture( + params=[ + [1, 2, 1, 3, 5, None, None], + [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + [], + [None, None], + [1, 2, 3, 4, 5], + ] +) +def null_data(request): + is_empty = pl.Series(request.param).dtype == pl.Null + return pl.DataFrame( + { + "a": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + "b": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + } + ).lazy() + + +def test_drop_null(null_data): + q = null_data.select(pl.col("a").drop_nulls()) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "value", + [0, pl.col("a").mean(), pl.col("b")], + ids=["scalar", "aggregation", "column_expression"], +) +def test_fill_null(null_data, value): + q = null_data.select(pl.col("a").fill_null(value)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] +) +def test_fill_null_with_strategy(null_data, strategy): + q = null_data.select(pl.col("a").fill_null(strategy=strategy)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("strategy", ["forward", "backward"]) +@pytest.mark.parametrize("limit", [0, 1, 2]) +def test_fill_null_with_limit(null_data, strategy, limit): + q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) From e6d412cba7c23df7ee500c28257ed9281cea49b9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 06:03:28 -0500 Subject: [PATCH 03/15] Fall back when casting a 
timestamp to numeric in cudf-polars (#16232) This PR adds logic that falls back to CPU when a cudf-polars query would cast a timestamp column to a numeric type, an unsupported operation in libcudf, which should fix a few polars tests. It could be cleaned up a bit with some of the utilities that will be added in https://github.com/rapidsai/cudf/pull/16150. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16232 --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 ++ .../tests/expressions/test_casting.py | 52 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 python/cudf_polars/tests/expressions/test_casting.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 8322d6bd6fb..9835e6f8461 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1188,6 +1188,10 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) + if not plc.unary.is_supported_cast(self.dtype, value.dtype): + raise NotImplementedError( + f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" + ) def do_evaluate( self, diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py new file mode 100644 index 00000000000..3e003054338 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + +_supported_dtypes = [(pl.Int8(), pl.Int64())] + +_unsupported_dtypes = [ + (pl.String(), pl.Int64()), +] + + +@pytest.fixture +def dtypes(request): + return request.param + + +@pytest.fixture +def tests(dtypes): + fromtype, totype = dtypes + if fromtype == pl.String(): + data = ["a", "b", "c"] + else: + data = [1, 2, 3] + return pl.DataFrame( + { + "a": pl.Series(data, dtype=fromtype), + } + ).lazy(), totype + + +@pytest.mark.parametrize("dtypes", _supported_dtypes, indirect=True) +def test_cast_supported(tests): + df, totype = tests + q = df.select(pl.col("a").cast(totype)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("dtypes", _unsupported_dtypes, indirect=True) +def test_cast_unsupported(tests): + df, totype = tests + assert_ir_translation_raises( + df.select(pl.col("a").cast(totype)), NotImplementedError + ) From ff30c0211109e14b1f6918fcc6c2e2b98f863a1f Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:03:55 -0700 Subject: [PATCH 04/15] Fix compile warnings with `jni_utils.hpp` (#16336) This fixes the compiler warnings with `jni_utils.hpp`, removing some `const` qualifiers that are redundant. Closes https://github.com/rapidsai/cudf/issues/16335. 
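For background (an illustration, not code taken from this PR): a cv-qualifier at the top level of a by-value return type has no effect on the caller, so compilers flag it (e.g. GCC's `-Wignored-qualifiers`). That is why signatures such as `const jstring get_jstring() const` and `const N_TYPE* const data() const` in the diff below lose their redundant `const`. A minimal repro of the warning class (compile with `-Wextra`), with made-up names:

```cpp
struct holder {
  int value{};
  int const* ptr{};

  const int get_value() const { return value; }      // warning: type qualifiers ignored on return type
  int get_value_fixed() const { return value; }      // equivalent behaviour, no warning

  const int* const get_ptr() const { return ptr; }   // warning: top-level const on the returned pointer is ignored
  int const* get_ptr_fixed() const { return ptr; }   // pointee stays const; warning gone
};
```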
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/16336 --- java/src/main/native/include/jni_utils.hpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ea04c1cda83..a3b4bfcb63e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -284,7 +284,7 @@ class native_jArray { return data()[index]; } - const N_TYPE* const data() const + N_TYPE const* data() const { init_data_ptr(); return data_ptr; @@ -296,17 +296,15 @@ class native_jArray { return data_ptr; } - const N_TYPE* const begin() const { return data(); } + N_TYPE const* begin() const { return data(); } N_TYPE* begin() { return data(); } - const N_TYPE* const end() const { return data() + size(); } + N_TYPE const* end() const { return data() + size(); } N_TYPE* end() { return data() + size(); } - const J_ARRAY_TYPE get_jArray() const { return orig; } - - J_ARRAY_TYPE get_jArray() { return orig; } + J_ARRAY_TYPE get_jArray() const { return orig; } /** * @brief Conversion to std::vector @@ -430,9 +428,7 @@ class native_jpointerArray { T* const* begin() const { return data(); } T* const* end() const { return data() + size(); } - const jlongArray get_jArray() const { return wrapped.get_jArray(); } - - jlongArray get_jArray() { return wrapped.get_jArray(); } + jlongArray get_jArray() const { return wrapped.get_jArray(); } void assert_no_nulls() const { @@ -624,7 +620,7 @@ class native_jstring { return true; } - const jstring get_jstring() const { return orig; } + jstring get_jstring() const { return orig; } ~native_jstring() { @@ -753,13 +749,13 @@ class native_jstringArray { return cache[index]; } - char const** const as_c_array() const + char const** as_c_array() const { init_c_cache(); return c_cache.data(); } - const std::vector as_cpp_vector() const + std::vector as_cpp_vector() const { init_cpp_cache(); return cpp_cache; From cd711913d2312ba158e34f5c03784a7b07f1583a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Jul 2024 00:24:19 +0200 Subject: [PATCH 05/15] Adds write-coalescing code path optimization to FST (#16143) This PR adds an optimized code path to the finite-state transducer (FST) that will use a shared memory-backed write buffer for the translated output and translated output indexes, if the the write buffer does not require allocating excessive amounts of shared memory (i.e., current heuristic is 24 KB/CTA). Writes are first buffered in shared memory and then collaboratively written out using coalesced writes to global memory. ## Benchmark results Numbers are for libcudf's FST_NVBENCH for a 1.073 GB input. FST outputs one token per input symbol. Benchmarks run on V100 with 900 GB/s theoretical peak BW. We compare the current FST implementation (old) to an FST implementaation that uses write-coalescing to gmem (new). 
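For intuition, here is a minimal, self-contained sketch of the staging-then-flush pattern described above. It is illustrative only — a toy "drop all spaces" transform over a single tile, not the kernel added by this PR — and every name in it is made up:

```cuda
#include <cub/block/block_scan.cuh>

// Each thread emits a variable number of output bytes, stages them in a shared
// memory tile buffer at an offset obtained from a block-wide exclusive scan,
// and the whole block then flushes the packed buffer with coalesced stores.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void compact_tile(char const* d_in, int num_items, char* d_out, int* d_out_count)
{
  using BlockScan = cub::BlockScan<int, BLOCK_THREADS>;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ char tile_buffer[BLOCK_THREADS * ITEMS_PER_THREAD];
  __shared__ int tile_count;

  // 1) Gather this thread's outputs into registers and count them
  char thread_out[ITEMS_PER_THREAD];
  int thread_count = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
    int const idx = threadIdx.x * ITEMS_PER_THREAD + i;
    if (idx < num_items && d_in[idx] != ' ') { thread_out[thread_count++] = d_in[idx]; }
  }

  // 2) Exclusive scan gives each thread its write offset within the tile buffer
  int thread_offset = 0;
  int block_total   = 0;
  BlockScan(scan_storage).ExclusiveSum(thread_count, thread_offset, block_total);
  if (threadIdx.x == 0) { tile_count = block_total; }

  // 3) Scatter thread-local outputs into shared memory (uncoalesced, but cheap)
  for (int i = 0; i < thread_count; ++i) { tile_buffer[thread_offset + i] = thread_out[i]; }
  __syncthreads();

  // 4) Flush the densely packed buffer to global memory with block-strided,
  //    coalesced stores -- the write pattern the new code path targets
  for (int i = threadIdx.x; i < tile_count; i += BLOCK_THREADS) {
    d_out[i] = tile_buffer[i];
  }
  if (threadIdx.x == 0) { *d_out_count = tile_count; }
}

// Example launch for a single tile of 128 * 16 input bytes:
//   compact_tile<128, 16><<<1, 128>>>(d_in, n, d_out, d_out_count);
```

With that picture in mind, the measured throughput comparison is: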
| | OLD throughput (GB/s) | NEW throughput (GB/s) | relative performance | | 1st kernel, per byte: bytes read/written | 2nd kernel, per byte: bytes read/written | expected SOL (GB/s) | achieved SOL (old) | achieved SOL (new) | |------------------|------------------------|------------------------|----------------------|---|------------------------------------------|------------------------------------------|---------------------|--------------------|--------------------| | full | 15.7 | 74.74 | 476% | | 1 | 6 | 102.86 | 15.26% | 72.66% | | no out-indexes | 39.123 | 105.8 | 270% | | 1 | 2 | 240.00 | 16.30% | 44.08% | | no-output | 229.27 | 178.92 | 78% | | 1 | 1 | 360.00 | 63.69% | 49.70% | | out-indexes-only | 24.95 | 85.2 | 341% | | 1 | 5 | 120.00 | 20.79% | 71.00% | Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16143 --- cpp/benchmarks/io/fst.cu | 16 +- cpp/src/io/fst/agent_dfa.cuh | 371 ++++++++++++++++++++++---- cpp/src/io/fst/dispatch_dfa.cuh | 7 +- cpp/src/io/fst/lookup_tables.cuh | 70 +++-- cpp/src/io/json/json_normalization.cu | 26 +- cpp/src/io/json/nested_json_gpu.cu | 25 +- cpp/tests/io/fst/common.hpp | 4 +- cpp/tests/io/fst/fst_test.cu | 4 +- 8 files changed, 425 insertions(+), 98 deletions(-) diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index ad19bdfdfcb..31f1bf8e70f 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -95,7 +95,9 @@ void BM_FST_JSON(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 2171764decd..bc5b94e2718 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -18,7 +18,9 @@ #include "in_reg_array.cuh" #include +#include #include 
+#include #include namespace cudf::io::fst::detail { @@ -44,9 +46,10 @@ using StateIndexT = uint32_t; template struct VectorCompositeOp { template - __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res{}; +#pragma unroll for (int32_t i = 0; i < NUM_ITEMS; ++i) { res.Set(i, rhs.Get(lhs.Get(i))); } @@ -57,61 +60,275 @@ struct VectorCompositeOp { /** * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed - * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols - * it intends to write out and writing out such symbols to the given output iterators. + * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols + * it intends to write out. + */ +template +class DFACountCallbackWrapper { + public: + __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table) + : transducer_table(transducer_table) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const&) + { + out_count = 0; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + out_count += count; + } + + __device__ __forceinline__ void TearDown() {} + TransducerTableT const transducer_table; + uint32_t out_count{}; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. * + * @tparam MaxTranslatedOutChars The maximum number of symbols that are written on a any given state + * transition * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up * the symbols that are supposed to be emitted on a given state transition. - * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the * transducer table are assignable. - * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. 
*/ -template -class DFASimulationCallbackWrapper { +template +class DFAWriteCallbackWrapper { public: - __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( - TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) - : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) + : transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + out_offset(out_offset) { } template - __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + __device__ __forceinline__ void Init(OffsetT const& in_offset) + { + this->in_offset = in_offset; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type /*MaxTranslatedOutChars*/) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + +#pragma unroll + for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + } + out_offset += count; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) { - this->offset = offset; - if (!write) out_count = 0; + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + out_offset += count; } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + ReadSymbol(character_index, + old_state, + new_state, + symbol_id, + read_symbol, + cub::Int2Type{}); + } + + __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_offset; + uint32_t in_offset; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to + * global memory. 
+ * + * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given + * output iterator + * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of + * writing them to the given output iterator + * @tparam NumWriteBufferItems The number of items to allocate in shared memory for the write + * buffer. + * @tparam OutputT The type of the translated items + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. + */ +template +class WriteCoalescingCallbackWrapper { + struct TempStorage_Offsets { + uint16_t compacted_offset[NumWriteBufferItems]; + }; + struct TempStorage_Symbols { + OutputT compacted_symbols[NumWriteBufferItems]; + }; + using offset_cache_t = + ::cuda::std::conditional_t; + using symbol_cache_t = ::cuda::std:: + conditional_t, TempStorage_Symbols>; + struct TempStorage_ : offset_cache_t, symbol_cache_t {}; + + __device__ __forceinline__ TempStorage_& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage.Alias(); + } + TempStorage_& temp_storage; + + public: + struct TempStorage : cub::Uninitialized {}; + + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t thread_out_offset, + uint32_t tile_out_offset, + uint32_t tile_in_offset, + uint32_t tile_out_count) + : temp_storage(PrivateStorage()), + transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + thread_out_offset(thread_out_offset), + tile_out_offset(tile_out_offset), + tile_in_offset(tile_in_offset), + tile_out_count(tile_out_count) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->in_offset = offset; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - if (write) { -#if defined(__CUDA_ARCH__) -#pragma unroll 1 -#endif - for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = + for (uint32_t out_char = 0; out_char < count; out_char++) { + if constexpr (!DiscardIndexOutput) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + } + if constexpr (!DiscardTranslatedOutput) { + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_count + out_char] = offset + character_index; } } - out_count += count; + thread_out_offset += count; } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() + { + __syncthreads(); + if constexpr (!DiscardTranslatedOutput) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } + } + if constexpr (!DiscardIndexOutput) { + for (uint32_t 
out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } + } + __syncthreads(); + } public: TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; - uint32_t out_count; - uint32_t offset; - bool write; + uint32_t thread_out_offset; + uint32_t tile_out_offset; + uint32_t tile_in_offset; + uint32_t in_offset; + uint32_t tile_out_count; }; /** @@ -125,17 +342,18 @@ class DFASimulationCallbackWrapper { template class StateVectorTransitionOp { public: - __host__ __device__ __forceinline__ StateVectorTransitionOp( + __device__ __forceinline__ StateVectorTransitionOp( TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) const + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { +#pragma unroll for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); } @@ -152,17 +370,17 @@ struct StateTransitionOp { TransitionTableT const& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - StateIndexT state, - CallbackOpT& callback_op) + __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; @@ -420,7 +638,7 @@ struct AgentDFA { __syncthreads(); // Thread's symbols - CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + CharT const* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; // Parse thread's symbols and transition the state-vector if (is_full_block) { @@ -538,6 +756,43 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; + using OutSymbolT = typename DfaT::OutSymbolT; + // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; + static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT; + static constexpr bool discard_out_index = + ::cuda::std::is_same>::value; + static constexpr bool discard_out_it = + ::cuda::std::is_same>::value; + using NonWriteCoalescingT = + DFAWriteCallbackWrapper; + + using WriteCoalescingT = + WriteCoalescingCallbackWrapper; + + static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; + + // Use write-coalescing only if the worst-case output size per tile fits into shared memory + static constexpr bool can_use_smem_cache = + (sizeof(typename WriteCoalescingT::TempStorage) + sizeof(typename AgentDfaSimT::TempStorage) + + 
sizeof(typename DfaT::SymbolGroupStorageT) + sizeof(typename DfaT::TransitionTableStorageT) + + sizeof(typename DfaT::TranslationTableStorageT)) < (48 * 1024); + static constexpr bool use_smem_cache = + is_translation_pass and + (sizeof(typename WriteCoalescingT::TempStorage) <= AgentDFAPolicy::SMEM_THRESHOLD) and + can_use_smem_cache; + + using DFASimulationCallbackWrapperT = + typename cub::If::Type; + // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines @@ -576,7 +831,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that // incorporates all preceding blocks/tiles results) //------------------------------------------------------------------------------ - if (IS_SINGLE_PASS) { + if constexpr (IS_SINGLE_PASS) { uint32_t tile_idx = blockIdx.x; using StateVectorCompositeOpT = VectorCompositeOp; @@ -623,10 +878,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper - callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + DFACountCallbackWrapper count_chars_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -635,7 +887,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, state, - callback_wrapper, + count_chars_callback_op, cub::Int2Type()); __syncthreads(); @@ -650,15 +902,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; uint32_t tile_idx = blockIdx.x; + uint32_t tile_out_offset{}; + uint32_t tile_out_count{}; + uint32_t thread_out_offset{}; if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(callback_wrapper.out_count, - callback_wrapper.out_count, + .ExclusiveScan(count_chars_callback_op.out_count, + thread_out_offset, static_cast(0), cub::Sum{}, block_aggregate); - + tile_out_count = block_aggregate; if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { offset_tile_state.SetInclusive(0, block_aggregate); } @@ -671,22 +926,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan( - callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); - + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); + tile_out_offset = prefix_op.GetExclusivePrefix(); + tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); } } - callback_wrapper.write = true; + DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table, + transduced_out_it, + transduced_out_idx_it, + thread_out_offset, + tile_out_offset, + blockIdx.x * SYMBOLS_PER_BLOCK, + tile_out_count}; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, t_start_state, - callback_wrapper, + write_translated_callback_op, cub::Int2Type()); } } diff --git a/cpp/src/io/fst/dispatch_dfa.cuh 
b/cpp/src/io/fst/dispatch_dfa.cuh index be63ec6539f..ef5e9c8a78f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -37,6 +37,11 @@ struct AgentDFAPolicy { // The number of symbols processed by each thread static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + + // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering + // the output in a write buffer and instead immediately write out to global memory, potentially + // resulting in non-coalesced writes + static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024; }; /** @@ -49,7 +54,7 @@ struct DeviceFSMPolicy { struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { enum { BLOCK_THREADS = 128, - ITEMS_PER_THREAD = 32, + ITEMS_PER_THREAD = 16, }; using AgentDFAPolicy = AgentDFAPolicy; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5532a7f994b..ae1f81fd541 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -367,18 +367,18 @@ class TransitionTable { template static KernelParameter InitDeviceTransitionTable( - std::array, MAX_NUM_STATES> const& translation_table) + std::array, MAX_NUM_STATES> const& transition_table) { KernelParameter init_data{}; - // translation_table[state][symbol] -> new state - for (std::size_t state = 0; state < translation_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + // transition_table[state][symbol] -> new state + for (std::size_t state = 0; state < transition_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) { CUDF_EXPECTS( - static_cast(translation_table[state][symbol]) <= + static_cast(transition_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); init_data.transitions[symbol * MAX_NUM_STATES + state] = - static_cast(translation_table[state][symbol]); + static_cast(transition_table[state][symbol]); } } @@ -494,6 +494,10 @@ class dfa_device_view { // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + using OutSymbolT = typename TranslationTableT::OutSymbolT; + static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT; + using SymbolGroupStorageT = std::conditional_t::value, typename SymbolGroupIdLookupT::TempStorage, typename cub::NullType>; @@ -542,24 +546,33 @@ class dfa_device_view { * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output * symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT_ The maximum number of symbols being output by a single state + * transition * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used. 
*/ -template class TransducerLookupTable { private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; + OutSymbolT_ out_symbols[MAX_TABLE_SIZE]; }; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { @@ -567,6 +580,8 @@ class TransducerLookupTable { OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; @@ -686,14 +701,19 @@ class TransducerLookupTable { * sequence of symbols that the finite-state transducer is supposed to output for each transition. * * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used + * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state + * transition * @tparam OutSymbolT The symbol type being output - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @param translation_table The translation table * @return A translation table of type `TransducerLookupTable`. */ template @@ -705,20 +725,30 @@ auto make_translation_table(std::array, MAX_N OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; return translation_table_t::InitDeviceTranslationTable(translation_table); } -template +template class TranslationOp { private: struct _TempStorage {}; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - using LookupTableT = TranslationOp; + using LookupTableT = + TranslationOp; TranslationOpT translation_op; }; @@ -772,6 +802,10 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given + * input symbol + * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given + * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. 
* Invocations of the first signature, (1), must return the number of symbols that are emitted for @@ -779,10 +813,14 @@ class TranslationOp { * that transition, where `i` corresponds to `relative_offse` * @return A translation table of type `TranslationO` */ -template +template auto make_translation_functor(FunctorT map_op) { - return TranslationOp::InitDeviceTranslationTable(map_op); + return TranslationOp:: + InitDeviceTranslationTable(map_op); } /** diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ca56a12eb36..760b2214365 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_uvector outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); @@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto parser = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), - fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), - stream); + static constexpr std::int32_t min_out = 0; + static constexpr std::int32_t max_out = 2; + auto parser = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor( + normalize_whitespace::TransduceToNormalizedWS{}), + stream); rmm::device_uvector outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a007754ef4f..8decaf034f3 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1455,11 +1455,14 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - auto json_to_stack_ops_fst = fst::detail::make_fst( + static constexpr auto min_translated_out = 0; + static constexpr auto max_translated_out = 1; + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)), fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)), - fst::detail::make_translation_table( - to_stack_op::get_translation_table(stack_behavior)), + fst::detail:: + make_translation_table( + to_stack_op::get_translation_table(stack_behavior)), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end @@ -1507,11 +1510,12 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceToken{}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + 
fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1598,7 +1602,8 @@ std::pair, rmm::device_uvector> ge fst::detail::make_symbol_group_lookup_op( fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}), fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), - fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + fst::detail::make_translation_functor( + fix_stack_of_excess_chars::TransduceInputOp{}), stream); fix_stack_of_excess_chars.Transduce(zip_in, static_cast(json_in.size()), @@ -1619,7 +1624,7 @@ std::pair, rmm::device_uvector> ge auto json_to_tokens_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), - fst::detail::make_translation_table( + fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), stream); diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 382d21fabb8..0177300eda9 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,6 +69,8 @@ std::array, TT_NUM_STATES> const pda_s /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) +static constexpr auto min_translated_out = 1; +static constexpr auto max_translated_out = 1; std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ {/* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 4df0d3ae04d..8a8d3d39e0f 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); // Allocate device-side temporary storage & run algorithm From 39f256c3397afc9c495cb819636abddb23f81dc0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:03:16 -0500 Subject: [PATCH 06/15] Fall back to CPU for unsupported libcudf binaryops in cudf-polars (#16188) This PR adds logic that should trigger CPU fallback unsupported binary ops. 
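As a quick illustration, here is a minimal sketch of the check this PR introduces. The standalone helper below is hypothetical (in the PR the check lives in the `BinOp` constructor in `cudf_polars/dsl/expr.py`); the `plc.binaryop.is_supported_operation` call and the error message mirror the diff.

```python
import cudf._lib.pylibcudf as plc


def raise_if_unsupported_binop(out_dtype, left_dtype, right_dtype, op):
    # Ask libcudf whether this (output, lhs, rhs, op) combination is supported.
    # Raising NotImplementedError during IR translation makes cudf-polars decline
    # the query, so polars executes it on the CPU instead.
    if not plc.binaryop.is_supported_operation(out_dtype, left_dtype, right_dtype, op):
        raise NotImplementedError(
            f"Operation {op.name} not supported "
            f"for types {left_dtype.id().name} and {right_dtype.id().name} "
            f"with output type {out_dtype.id().name}"
        )
```

This replaces the previous special-cased timelike resolution check with a single query to libcudf, so the fallback criterion stays in sync with what libcudf can actually execute.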
Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16188 --- python/cudf_polars/cudf_polars/dsl/expr.py | 13 ++++--- .../cudf_polars/cudf_polars/utils/dtypes.py | 38 +------------------ .../tests/expressions/test_literal.py | 18 ++++++--- 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 9835e6f8461..6325feced94 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1424,13 +1424,14 @@ def __init__( super().__init__(dtype) self.op = op self.children = (left, right) - if ( - op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and plc.traits.is_chrono(left.dtype) - and plc.traits.is_chrono(right.dtype) - and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op ): - raise NotImplementedError("Casting rules for timelike types") + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 1279fe91d48..cd68d021286 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -14,43 +14,7 @@ import cudf._lib.pylibcudf as plc -__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] - - -def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): - """ - Do two datetime typeids have matching resolution for a binop. - - Parameters - ---------- - lid - Left type id - rid - Right type id - - Returns - ------- - True if resolutions are compatible, False otherwise. - - Notes - ----- - Polars has different casting rules for combining - datetimes/durations than libcudf, and while we don't encode the - casting rules fully, just reject things we can't handle. - - Precondition for correctness: both lid and rid are timelike. - """ - if lid == rid: - return True - # Timestamps are smaller than durations in the libcudf enum. 
- lid, rid = sorted([lid, rid]) - if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: - return rid == plc.TypeId.DURATION_MILLISECONDS - elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: - return rid == plc.TypeId.DURATION_MICROSECONDS - elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: - return rid == plc.TypeId.DURATION_NANOSECONDS - return False +__all__ = ["from_polars", "downcast_arrow_lists"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 55e688428bd..5bd3131d1d7 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -6,6 +6,8 @@ import polars as pl +import cudf._lib.pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -64,11 +66,17 @@ def test_timelike_literal(timestamp, timedelta): adjusted=timestamp + timedelta, two_delta=timedelta + timedelta, ) - schema = q.collect_schema() - time_type = schema["time"] - delta_type = schema["delta"] - if dtypes.have_compatible_resolution( - dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + schema = {k: dtypes.from_polars(v) for k, v in q.collect_schema().items()} + if plc.binaryop.is_supported_operation( + schema["adjusted"], + schema["time"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, + ) and plc.binaryop.is_supported_operation( + schema["two_delta"], + schema["delta"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, ): assert_gpu_result_equal(q) else: From f0efc8b36a8f43cfa027966265dcea052bb5c45d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 23 Jul 2024 17:17:05 -0700 Subject: [PATCH 07/15] Modify `make_host_vector` and `make_device_uvector` factories to optionally use pinned memory and kernel copy (#16206) Issue #15616 Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel. Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU. Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy. Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`. Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU. Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because of an optimization that allows it to fall back to `cudaMemCpyAsync`. We now call a simple local kernel. 
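For reviewers, a minimal usage sketch of the modified factories is below. All functions shown are the ones added or changed in this PR; the wrapping function and the 1 MiB threshold are illustrative only.

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void copy_offsets_example(rmm::cuda_stream_view stream)
{
  // Host allocations at or below this threshold come from the pinned resource.
  cudf::set_allocate_host_as_pinned_threshold(1024 * 1024);

  // Returns cudf::detail::host_vector; pinned or pageable depending on the threshold.
  auto h_offsets = cudf::detail::make_empty_host_vector<cudf::size_type>(3, stream);
  h_offsets.push_back(0);
  h_offsets.push_back(128);
  h_offsets.push_back(256);

  // host_span remembers whether the buffer is device accessible, so the H2D copy
  // can take the kernel path for pinned memory and cudaMemcpyAsync otherwise.
  auto d_offsets = cudf::detail::make_device_uvector_async(
    h_offsets, stream, rmm::mr::get_current_device_resource());

  // D2H copy back into a (possibly pinned) host_vector.
  auto h_copy = cudf::detail::make_host_vector_sync(d_offsets, stream);
}
```

The point of threading `is_device_accessible` through the allocator and `host_span` is that callers like the one above do not need to know which resource backed the buffer; the copy path is chosen automatically.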
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/16206 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/null_mask.cuh | 4 +- .../cudf/detail/utilities/host_memory.hpp | 51 +++++++++ .../cudf/detail/utilities/host_vector.hpp | 24 +++- .../detail/utilities/vector_factories.hpp | 106 ++++++++++++------ cpp/include/cudf/io/text/detail/trie.hpp | 4 +- cpp/include/cudf/lists/detail/dremel.hpp | 10 +- cpp/include/cudf/utilities/pinned_memory.hpp | 16 +++ cpp/include/cudf/utilities/span.hpp | 32 ++++++ cpp/src/copying/concatenate.cu | 6 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/datetime/timezone.cpp | 6 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/io/avro/reader_impl.cu | 8 +- cpp/src/io/csv/reader_impl.cu | 44 +++++--- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/nested_json_gpu.cu | 6 +- cpp/src/io/json/read_json.cu | 3 +- cpp/src/io/orc/reader_impl_decode.cu | 10 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/orc/writer_impl.cu | 50 +++++---- cpp/src/io/orc/writer_impl.hpp | 9 +- cpp/src/io/parquet/predicate_pushdown.cpp | 20 ++-- cpp/src/io/parquet/reader_impl_chunking.cu | 78 +++++++------ cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +- cpp/src/io/parquet/writer_impl.cu | 7 +- cpp/src/lists/dremel.cu | 6 +- cpp/src/strings/combine/join.cu | 6 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/translate.cu | 2 +- cpp/src/table/row_operators.cu | 5 +- cpp/src/utilities/cuda_memcpy.cu | 20 +++- .../{pinned_memory.cpp => host_memory.cpp} | 86 +++++++++++++- cpp/tests/io/json/json_tree.cpp | 6 +- cpp/tests/strings/integers_tests.cpp | 4 +- .../utilities_tests/pinned_memory_tests.cpp | 67 ++++++++++- 40 files changed, 539 insertions(+), 192 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (73%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e79204a558..a2c2dd3af4c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -671,9 +671,9 @@ add_library( src/unary/null_ops.cu src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp + src/utilities/host_memory.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..d3e9fc4974d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh 
+++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. - auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..c6775a950c9 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); + +/** + * @brief Get the allocator to be used for the host memory allocation. + * + * @param size The number of elements of type T to allocate + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..f4e5f718da4 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -61,6 +61,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. 
* @@ -100,8 +104,12 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{contains_property} { } @@ -173,15 +181,25 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..45dc839c9bd 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -32,8 +34,6 @@ #include #include -#include - #include namespace cudf { @@ -100,11 +100,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -271,21 +272,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. 
+ * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -324,7 +318,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -361,11 +355,46 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `cudf::detail::host_vector` of the given size. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +/** + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -373,16 +402,24 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = make_host_vector(v.size(), stream); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. 
* * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -394,8 +431,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -404,7 +441,8 @@ thrust::host_vector make_host_vector_async( * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -412,7 +450,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -423,7 +461,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -435,8 +473,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } @@ -444,7 +482,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -460,7 +498,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..28862d97ede 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -223,11 +223,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..53448424827 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,8 +31,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; }; /** @@ -45,8 +45,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..fa7e1b35327 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. 
+ */ +size_t get_allocate_host_as_pinned_threshold(); + } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..c5054c733a7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -257,6 +259,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +84,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1539,7 +1539,8 @@ std::unique_ptr chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr make_timezone_transition_table(std::optional ttimes_typed; - ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + 
auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -567,7 +567,7 @@ void infer_column_types(parse_options const& parse_opts, } std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, + host_span column_flags, std::vector const& column_names, device_span data, device_span row_offsets, @@ -592,8 +592,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +622,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + host_span column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +655,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +700,10 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and 
duplicated strings @@ -842,8 +847,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device. - std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8decaf034f3..1e484d74679 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1703,10 +1703,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0ba4dedfc34..590f70864b1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 8e20505d3ff..e3b9a048be8 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 
0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 805959327ac..80f32512b98 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4cb20bb7518..f3b8cfbc836 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -444,14 +444,17 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. 
+ auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -470,7 +473,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -479,7 +484,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -1336,7 +1343,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1677,39 +1684,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. - std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1724,7 +1731,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1815,8 +1822,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); @@ -2299,7 +2305,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. 
*/ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 05e0d8c0111..794750ab6d2 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -804,16 +804,16 @@ std::vector compute_page_splits_by_row(device_span> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); + auto comp_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto comp_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); + auto copy_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto copy_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); rmm::device_uvector comp_res(num_comp_pages, stream); thrust::fill(rmm::exec_policy_nosync(stream), @@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(offset)}); + copy_out.push_back({dst_base, static_cast(offset)}); } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); + comp_in.push_back( + {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); + comp_out.push_back( + {dst_base + offset, static_cast(page.uncompressed_page_size - offset)}); page.page_data = dst_base; decomp_offset += page.uncompressed_page_size; }); + } + auto d_comp_in = cudf::detail::make_device_uvector_async( + comp_in, stream, rmm::mr::get_current_device_resource()); + auto d_comp_out = cudf::detail::make_device_uvector_async( + comp_out, stream, rmm::mr::get_current_device_resource()); + + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + device_span const> d_comp_in_view{d_comp_in.data() + start_pos, + codec.num_pages}; + + device_span const> d_comp_out_view(d_comp_out.data() + start_pos, + codec.num_pages); - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + gpuinflate( + d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream); break; case SNAPPY: if (cudf::io::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - 
d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream); } break; case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index ff47dfc4cf3..e006cc7d714 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1523,8 +1523,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8413e716224..2df71b77301 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1824,7 +1824,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1880,7 +1881,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of 
the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, 
entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include "cudf/detail/utilities/integer_utils.hpp" + +#include #include #include #include @@ -26,15 +29,24 @@ namespace cudf::detail { namespace { +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 73% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index 3ea4293fc60..7c3cea42023 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -186,6 +186,61 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); + } catch (std::bad_alloc const& e) { + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, 
cuda::stream_ref stream) + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pageable pool mr must be accessible from the host"); + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -225,4 +280,29 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +} // namespace detail + } // namespace cudf diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 7a72b77e1fb..8bcd5790e99 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu( { constexpr bool include_quote_char = true; // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(tokens_gpu, stream); - thrust::host_vector token_indices = - cudf::detail::make_host_vector_async(token_indices_gpu1, stream); + auto tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream); + auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp 
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +} From 75289c58f3d9ca11a51396e4adadfbd5f51856f5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 23 Jul 2024 23:45:59 -0500 Subject: [PATCH 08/15] Rename PrefetchConfig to prefetch_config. (#16358) This PR addresses a comment requesting a rename of `PrefetchConfig` to `prefetch_config`. 
See: https://github.com/rapidsai/cudf/pull/16020#discussion_r1686284151 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16358 --- cpp/include/cudf/utilities/prefetch.hpp | 10 +++++----- cpp/src/column/column_view.cpp | 2 +- cpp/src/utilities/prefetch.cpp | 21 ++++++++++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp index 5ca6fd6f4b0..88c634a7cc7 100644 --- a/cpp/include/cudf/utilities/prefetch.hpp +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -31,17 +31,17 @@ namespace detail { /** * @brief A singleton class that manages the prefetching configuration. */ -class PrefetchConfig { +class prefetch_config { public: - PrefetchConfig& operator=(const PrefetchConfig&) = delete; - PrefetchConfig(const PrefetchConfig&) = delete; + prefetch_config& operator=(const prefetch_config&) = delete; + prefetch_config(const prefetch_config&) = delete; /** * @brief Get the singleton instance of the prefetching configuration. * * @return The singleton instance of the prefetching configuration. */ - static PrefetchConfig& instance(); + static prefetch_config& instance(); /** * @brief Get the value of a configuration key. @@ -65,7 +65,7 @@ class PrefetchConfig { bool debug{false}; private: - PrefetchConfig() = default; //< Private constructor to enforce singleton pattern + prefetch_config() = default; //< Private constructor to enforce singleton pattern std::map config_values; //< Map of configuration keys to values }; diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index a9605efb362..b0f9e9f0e74 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -39,7 +39,7 @@ namespace { template void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept { - if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) { + if (cudf::experimental::prefetch::detail::prefetch_config::instance().get(key)) { if (cudf::is_fixed_width(col.type())) { cudf::experimental::prefetch::detail::prefetch_noexcept( key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 21f2e40c82a..16f2c3a1202 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -26,13 +26,13 @@ namespace cudf::experimental::prefetch { namespace detail { -PrefetchConfig& PrefetchConfig::instance() +prefetch_config& prefetch_config::instance() { - static PrefetchConfig instance; + static prefetch_config instance; return instance; } -bool PrefetchConfig::get(std::string_view key) +bool prefetch_config::get(std::string_view key) { // Default to not prefetching if (config_values.find(key.data()) == config_values.end()) { @@ -40,7 +40,7 @@ bool PrefetchConfig::get(std::string_view key) } return config_values[key.data()]; } -void PrefetchConfig::set(std::string_view key, bool value) { config_values[key.data()] = value; } +void prefetch_config::set(std::string_view key, bool value) { config_values[key.data()] = value; } cudaError_t prefetch_noexcept(std::string_view key, void const* ptr, @@ -48,8 +48,8 @@ cudaError_t prefetch_noexcept(std::string_view key, rmm::cuda_stream_view stream, 
rmm::cuda_device_id device_id) noexcept { - if (PrefetchConfig::instance().get(key)) { - if (PrefetchConfig::instance().debug) { + if (prefetch_config::instance().get(key)) { + if (prefetch_config::instance().debug) { std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr << std::endl; } @@ -78,12 +78,15 @@ void prefetch(std::string_view key, } // namespace detail -void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); } +void enable_prefetching(std::string_view key) +{ + detail::prefetch_config::instance().set(key, true); +} void disable_prefetching(std::string_view key) { - detail::PrefetchConfig::instance().set(key, false); + detail::prefetch_config::instance().set(key, false); } -void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; } +void prefetch_debugging(bool enable) { detail::prefetch_config::instance().debug = enable; } } // namespace cudf::experimental::prefetch From 8c1749b40eaa983966ed3bece6bdd29a4316d18a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 24 Jul 2024 01:19:10 -0400 Subject: [PATCH 09/15] Use rapids_cpm_bs_thread_pool() (#16360) Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16360 --- cpp/CMakeLists.txt | 2 +- cpp/cmake/thirdparty/get_thread_pool.cmake | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a2c2dd3af4c..b044545bb08 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -807,7 +807,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake index 264257c7199..235bf409058 100644 --- a/cpp/cmake/thirdparty/get_thread_pool.cmake +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -12,20 +12,14 @@ # the License. # ============================================================================= -# This function finds rmm and sets any additional necessary environment variables. 
+# Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and +# to support installing it ourselves function(find_and_configure_thread_pool) - rapids_cpm_find( - BS_thread_pool 4.1.0 - CPM_ARGS - GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git - GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 - GIT_SHALLOW TRUE - ) - if(NOT TARGET BS_thread_pool) - add_library(BS_thread_pool INTERFACE) - target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) - target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") - endif() + include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake) + + # Find or install thread-pool + rapids_cpm_bs_thread_pool(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endfunction() find_and_configure_thread_pool() From 62625f1bfcdb980186a1afbec41e420fdb4a7075 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 24 Jul 2024 03:42:03 -0400 Subject: [PATCH 10/15] Host implementation of `to_arrow` using nanoarrow (#16297) Adds the corresponding `to_arrow_host` functions for interop using `ArrowDeviceArray`. This includes updating the version of nanoarrow in use to pick up some bug fixes and features. Authors: - Matt Topol (https://github.com/zeroshade) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16297 --- cpp/CMakeLists.txt | 1 + cpp/cmake/thirdparty/get_nanoarrow.cmake | 4 +- cpp/include/cudf/interop.hpp | 80 +- cpp/include/cudf/interop/detail/arrow.hpp | 53 - cpp/src/interop/arrow_utilities.cpp | 31 + cpp/src/interop/arrow_utilities.hpp | 43 +- cpp/src/interop/from_arrow_device.cu | 10 +- cpp/src/interop/from_arrow_host.cu | 2 +- cpp/src/interop/to_arrow.cu | 33 +- cpp/src/interop/to_arrow_device.cu | 101 +- cpp/src/interop/to_arrow_host.cu | 428 ++++++++ cpp/src/interop/to_arrow_schema.cpp | 7 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/nanoarrow_utils.hpp | 9 +- cpp/tests/interop/to_arrow_device_test.cpp | 1 - cpp/tests/interop/to_arrow_host_test.cpp | 1117 ++++++++++++++++++++ 16 files changed, 1760 insertions(+), 161 deletions(-) delete mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/src/interop/to_arrow_host.cu create mode 100644 cpp/tests/interop/to_arrow_host_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b044545bb08..24b683a930b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -367,6 +367,7 @@ add_library( src/interop/arrow_utilities.cpp src/interop/to_arrow.cu src/interop/to_arrow_device.cu + src/interop/to_arrow_host.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/from_arrow_stream.cu diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 025bff7d8f0..8df1b431095 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,11 +17,11 @@ function(find_and_configure_nanoarrow) # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( - nanoarrow 0.5.0 + nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 
11e73a8c85b45e3d49c8c541b4e1497a649fe03c + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 11f6ce2bad7..61f7d72a467 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -136,6 +136,8 @@ struct column_metadata { * Converts the `cudf::table_view` to `arrow::Table` with the provided * metadata `column_names`. * + * @deprecated Since 24.08. Use cudf::to_arrow_host instead. + * * @throws cudf::logic_error if `column_names` size doesn't match with number of columns. * * @param input table_view that needs to be converted to arrow Table @@ -150,16 +152,19 @@ struct column_metadata { * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. */ -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated]] std::shared_ptr to_arrow( + table_view input, + std::vector const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `arrow::Scalar` from cudf scalar `input` * * Converts the `cudf::scalar` to `arrow::Scalar`. * + * @deprecated Since 24.08. + * * @param input scalar that needs to be converted to arrow Scalar * @param metadata Contains hierarchy of names of columns and children * @param stream CUDA stream used for device memory operations and kernel launches @@ -172,10 +177,11 @@ std::shared_ptr to_arrow(table_view input, * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. */ -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated]] std::shared_ptr to_arrow( + cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter @@ -329,15 +335,67 @@ unique_device_array_t to_arrow_device( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Copy table view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the table view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. 
+ * + * @param table Input table + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input table + */ +unique_device_array_t to_arrow_host( + cudf::table_view const& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copy column view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the column view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. + * + * @param col Input column + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input column + */ +unique_device_array_t to_arrow_host( + cudf::column_view const& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * + * @deprecated Since 24.08. Use cudf::from_arrow_host instead. + * * @param input arrow:Table that needs to be converted to `cudf::table` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ -std::unique_ptr
<table> from_arrow( +[[deprecated]] std::unique_ptr<table>
from_arrow( arrow::Table const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -345,12 +403,14 @@ std::unique_ptr<table>
from_arrow( /** * @brief Create `cudf::scalar` from given arrow Scalar input * + * @deprecated Since 24.08. + * * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ -std::unique_ptr from_arrow( +[[deprecated]] std::unique_ptr from_arrow( arrow::Scalar const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp deleted file mode 100644 index 906d48f636b..00000000000 --- a/cpp/include/cudf/interop/detail/arrow.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// from Arrow C Device Data Interface -// https://arrow.apache.org/docs/format/CDeviceDataInterface.html -#ifndef ARROW_C_DEVICE_DATA_INTERFACE -#define ARROW_C_DEVICE_DATA_INTERFACE - -// Device type for the allocated memory -using ArrowDeviceType = int32_t; - -// The Arrow spec specifies using macros rather than enums here to avoid being -// susceptible to changes in the underlying type chosen by the compiler, but -// clang-tidy doesn't like this. 
-// NOLINTBEGIN -// CPU device, same as using ArrowArray directly -#define ARROW_DEVICE_CPU 1 -// CUDA GPU Device -#define ARROW_DEVICE_CUDA 2 -// Pinned CUDA CPU memory by cudaMallocHost -#define ARROW_DEVICE_CUDA_HOST 3 -// CUDA managed/unified memory allocated by cudaMallocManaged -#define ARROW_DEVICE_CUDA_MANAGED 13 -// NOLINTEND - -struct ArrowDeviceArray { - struct ArrowArray array; - int64_t device_id; - ArrowDeviceType device_type; - void* sync_event; - - // reserved bytes for future expansion - int64_t reserved[3]; -}; - -#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 605d813ed1e..4292552a800 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -16,9 +16,16 @@ #include "arrow_utilities.hpp" +#include #include #include +#include +#include + +#include +#include + #include namespace cudf { @@ -83,9 +90,33 @@ ArrowType id_to_arrow_type(cudf::type_id id) case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128; default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); } } +ArrowType id_to_arrow_storage_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(id); + } +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 4e2628ab689..1cee3071fcb 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -18,8 +18,12 @@ #include +#include +#include +#include +#include + #include -#include namespace cudf { namespace detail { @@ -47,5 +51,42 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view); */ ArrowType id_to_arrow_type(cudf::type_id id); +/** + * @brief Map cudf column type id to the storage type for Arrow + * + * Specifically this is for handling the underlying storage type of + * timestamps and durations. + * + * @param id column type id + * @return ArrowType storage type + */ +ArrowType id_to_arrow_storage_type(cudf::type_id id); + +/** + * @brief Helper to initialize ArrowArray struct + * + * @param arr Pointer to ArrowArray to initialize + * @param storage_type The type to initialize with + * @param column view for column to get the length and null count from + * @return nanoarrow status code, should be NANOARROW_OK if there are no errors + */ +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column); + +/** + * @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility + * + * The template parameter should be the underlying type of the data (e.g. 
int32_t for + * 32-bit decimal and int64_t for 64-bit decimal). + * + * @param input column_view of the data + * @param stream cuda stream to perform the operations on + * @param mr memory resource to allocate the returned device_uvector with + * @return unique_ptr to a device_buffer containing the upcasted data + */ +template +std::unique_ptr decimals_to_arrow(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index e1d289e67a3..440df571de0 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include +#include namespace cudf { @@ -144,9 +144,6 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, - "Large strings are not yet supported in from_arrow_device", - cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, @@ -158,12 +155,15 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( {}); } - auto offsets_view = column_view{data_type(type_id::INT32), + data_type offsets_type(type_id::INT32); + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { offsets_type = data_type(type_id::INT64); } + auto offsets_view = column_view{offsets_type, static_cast(input->offset + input->length) + 1, input->buffers[fixed_width_data_buffer_idx], nullptr, 0, 0}; + return std::make_tuple( {type, static_cast(input->length), diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b3087dedf98..efde8f2a463 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -42,6 +41,7 @@ #include #include +#include namespace cudf { namespace detail { diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 622a3aba4bb..e89ecedc218 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arrow_utilities.hpp" #include "detail/arrow_allocator.hpp" #include @@ -157,33 +158,17 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - - rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - - auto count = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; - } - }); + auto buf = + detail::decimals_to_arrow(input, stream, rmm::mr::get_current_device_resource()); - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + buf->data(), + buf_size_in_bytes, + cudaMemcpyDefault, + stream.value())); auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index b9d3a59e647..2eb9b912054 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -56,14 +56,6 @@ void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) delete unique_buffer; } -int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) -{ - NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); - arr->length = column.size(); - arr->null_count = column.null_count(); - return NANOARROW_OK; -} - template struct is_device_scalar : public std::false_type {}; @@ -99,21 +91,6 @@ int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) return NANOARROW_OK; } -ArrowType id_to_arrow_storage_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::TIMESTAMP_SECONDS: - case cudf::type_id::TIMESTAMP_MILLISECONDS: - case cudf::type_id::TIMESTAMP_MICROSECONDS: - case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; - case cudf::type_id::DURATION_SECONDS: - case cudf::type_id::DURATION_MILLISECONDS: - case cudf::type_id::DURATION_MICROSECONDS: - case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; - default: return id_to_arrow_type(id); - } -} - struct dispatch_to_arrow_device { template ())> int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*) @@ -156,35 +133,15 @@ struct dispatch_to_arrow_device { }; template -int decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr, - ArrowArray* out) +int construct_decimals(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) { nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = - std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); - - auto count = thrust::counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), out = buf->data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; - } - }); + auto buf = detail::decimals_to_arrow(input, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); @@ -198,7 +155,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -211,7 +168,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -256,8 +213,15 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum rmm::device_async_resource_ref mr, ArrowArray* out) { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // the scalar zero here is necessary because the spec for string arrays states @@ -265,8 +229,14 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // the case of a 0 length string array, there should be exactly 1 value, zero, // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
- auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } @@ -436,7 +406,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -445,7 +415,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -481,13 +451,26 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 - auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu new file mode 100644 index 00000000000..c9e53ebaab7 --- /dev/null +++ b/cpp/src/interop/to_arrow_host.cu @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr decimals_to_arrow(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); + auto buf = std::make_unique(input.size() * sizeof(__int128_t), stream, mr); + + auto count = thrust::counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream, mr), + count, + count + input.size(), + [in = input.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return buf; +} + +template std::unique_ptr decimals_to_arrow( + cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr decimals_to_arrow( + cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +namespace { + +struct dispatch_to_arrow_host { + cudf::column_view column; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + int populate_validity_bitmap(ArrowBitmap* bitmap) const + { + if (!column.has_nulls()) { return NANOARROW_OK; } + + NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, + (column.offset() > 0) + ? cudf::detail::copy_bitmask(column, stream, mr).data() + : column.null_mask(), + bitmap->buffer.size_bytes, + cudaMemcpyDefault, + stream.value())); + return NANOARROW_OK; + } + + template + int populate_data_buffer(device_span input, ArrowBuffer* buffer) const + { + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + CUDF_CUDA_TRY(cudaMemcpyAsync( + buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); + return NANOARROW_OK; + } + + template () && !cudf::is_fixed_point())> + int operator()(ArrowArray*) const + { + CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error); + } + + template () || std::is_same_v)> + int operator()(ArrowArray* out) const + { + nanoarrow::UniqueArray tmp; + + auto const storage_type = id_to_arrow_storage_type(column.type().id()); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + using DataType = std::conditional_t, __int128_t, T>; + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(column.data(), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + // convert decimal types from libcudf to arrow where those types are not directly + // supported by Arrow. 
These types must be fit into 128 bits, the smallest + // decimal resolution supported by Arrow + template () && + (std::is_same_v || + std::is_same_v))> + int operator()(ArrowArray* out) const + { + using DeviceType = std::conditional_t, int32_t, int64_t>; + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto buf = detail::decimals_to_arrow(column, stream, mr); + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span<__int128_t const>( + reinterpret_cast(buf->data()), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto bitmask = bools_to_mask(column, stream, mr); + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(reinterpret_cast(bitmask.first->data()), + bitmask.first->size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); + + if (column.size() == 0) { + // initialize the offset buffer with a single zero by convention + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::strings_column_view(column); + auto const offsets = scv.offsets(); + if (offsets.type().id() == cudf::type_id::INT64) { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } else { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(scv.chars_begin(stream), scv.chars_size(stream)), + ArrowArrayBuffer(tmp.get(), 2))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + 
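A minimal sketch of the layout that the list path below reproduces, using assumed values that are not taken from this patch: a cudf lists column and an Arrow LIST array share the same shape (a validity bitmap, an int32 offsets buffer of length size()+1, and one child holding the flattened elements), so the code only copies the offsets buffer and converts the child column recursively through get_column().

    // assumed example column: [[1, 2], [], [3]]
    // offsets buffer:         {0, 2, 2, 3}
    // flattened child:        {1, 2, 3}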
NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto const lcv = cudf::lists_column_view(column); + + if (column.size() == 0) { + // initialize the offsets buffer with a single zero by convention for 0 length + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(lcv.offsets_begin(), (column.size() + 1)), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + NANOARROW_RETURN_NOT_OK(get_column(lcv.child(), stream, mr, tmp->children[0])); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto dcv = cudf::dictionary_column_view(column); + auto dict_indices = dcv.get_indices_annotated(); + switch (dict_indices.type().id()) { + case type_id::INT8: + case type_id::UINT8: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT16: + case type_id::UINT16: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT32: + case type_id::UINT32: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT64: + case type_id::UINT64: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + default: CUDF_FAIL("unsupported type for dictionary indices"); + } + + NANOARROW_RETURN_NOT_OK(get_column(dcv.keys(), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::structs_column_view(column); + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto const child = scv.get_sliced_child(i, stream); + NANOARROW_RETURN_NOT_OK(get_column(child, stream, mr, child_ptr)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) +{ + return column.type().id() != type_id::EMPTY + ? 
type_dispatcher(column.type(), dispatch_to_arrow_host{column, stream, mr}, out) + : initialize_array(out, NANOARROW_TYPE_NA, column); +} + +unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out) +{ + ArrowError err; + if (ArrowArrayFinishBuildingDefault(out.get(), &err) != NANOARROW_OK) { + std::cerr << err.message << std::endl; + CUDF_FAIL("failed to build"); + } + + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + + result->device_id = -1; + result->device_type = ARROW_DEVICE_CPU; + result->sync_event = nullptr; + ArrowArrayMove(out.get(), &result->array); + return result; +} + +} // namespace + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + for (cudf::size_type i = 0; i < table.num_columns(); ++i) { + auto child = tmp->children[i]; + auto col = table.column(i); + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, child)); + } + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, tmp.get())); + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +} // namespace detail + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(col, stream, mr); +} + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(table, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 19915464236..b98ca8a7bed 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -120,7 +119,11 @@ int dispatch_to_arrow_type::operator()(column_view input, column_metadata const&, ArrowSchema* out) { - return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); + return ((input.num_children() == 0 || + input.child(cudf::strings_column_view::offsets_column_index).type().id() == + type_id::INT32)) + ? 
ArrowSchemaSetType(out, NANOARROW_TYPE_STRING) + : ArrowSchemaSetType(out, NANOARROW_TYPE_LARGE_STRING); } // these forward declarations are needed due to the recursive calls to them diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 05e9759632f..88187623930 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -270,6 +270,7 @@ ConfigureTest( INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/to_arrow_host_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 4147728b2a6..a961f73d955 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -29,6 +28,7 @@ #include #include +#include struct generated_test_data { generated_test_data(cudf::size_type length) @@ -211,6 +211,7 @@ DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(int32_t, INT32); +DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); #undef DEFINE_NANOARROW_STORAGE @@ -255,8 +256,7 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); - out.size_bits = b.size(); + std::memset(out.buffer.data, 0, out.buffer.size_bytes); for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); @@ -296,6 +296,7 @@ std::enable_if_t, nanoarrow::UniqueArray> g { nanoarrow::UniqueArray tmp; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(tmp.get()), mask.size())); NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); @@ -378,3 +379,5 @@ get_nanoarrow_cudf_table(cudf::size_type length); std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_host_tables(cudf::size_type length); + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 8903f09b82b..77da4039103 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp new file mode 100644 index 00000000000..fc0ed6c9352 --- /dev/null +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -0,0 +1,1117 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +using vector_of_columns = std::vector>; + +struct BaseToArrowHostFixture : public cudf::test::BaseFixture { + template + std::enable_if_t() and !std::is_same_v, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_val = ArrowArrayViewGetIntUnsafe(expected, start_offset_expected + i); + const auto actual_val = ArrowArrayViewGetIntUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_val, actual_val); + } + } + + template + std::enable_if_t, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_view = ArrowArrayViewGetBytesUnsafe(expected, start_offset_expected + i); + const auto actual_view = ArrowArrayViewGetBytesUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_view.size_bytes, actual_view.size_bytes); + EXPECT_TRUE( + 0 == std::memcmp(expected_view.data.data, actual_view.data.data, expected_view.size_bytes)); + } + } + + void compare_child_subset(ArrowArrayView const* expected, + int64_t exp_start_offset, + ArrowArrayView const* actual, + int64_t act_start_offset, + int64_t length) + { + EXPECT_EQ(expected->storage_type, actual->storage_type); + EXPECT_EQ(expected->n_children, actual->n_children); + + switch (expected->storage_type) { + case NANOARROW_TYPE_LIST: + for (int64_t i = 0; i < length; ++i) { + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + // ArrowArrayViewIsNull accounts for the array offset, so we can properly + // compare the validity of indexes + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + // ArrowArrayViewListChildOffset does not account for array offset, so we need + // to add the offset to the index in order to get the correct offset into the list + const int64_t start_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start); + const int64_t start_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start); + + const int64_t end_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start + 1); + const int64_t end_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start + 1); + + // verify the list lengths are the same + EXPECT_EQ(end_offset_expected - start_offset_expected, + end_offset_actual - start_offset_actual); + // compare the list values + compare_child_subset(expected->children[0], + start_offset_expected, + actual->children[0], + start_offset_actual, + 
end_offset_expected - start_offset_expected); + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < length; ++i) { + SCOPED_TRACE("idx: " + std::to_string(i)); + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + for (int64_t child = 0; child < expected->n_children; ++child) { + SCOPED_TRACE("child: " + std::to_string(child)); + compare_child_subset(expected->children[child], + expected_start + expected->offset, + actual->children[child], + actual_start + actual->offset, + 1); + } + } + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + compare_subset( + expected, exp_start_offset, actual, act_start_offset, length); + break; + default: + compare_subset(expected, exp_start_offset, actual, act_start_offset, length); + break; + } + } + + void compare_arrays(ArrowArrayView const* expected, ArrowArrayView const* actual) + { + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_children, actual->n_children); + EXPECT_EQ(expected->storage_type, actual->storage_type); + + // cudf automatically pushes down nulls and purges non-empty, non-zero nulls + // from the children columns. So while we can memcmp the buffers for top + // level arrays, we need to do an "equivalence" comparison for nested + // arrays (lists and structs) by checking each index for null and skipping + // comparisons for children if null. + switch (expected->storage_type) { + case NANOARROW_TYPE_STRUCT: + // if we're a struct with no children, then we just skip + // attempting to compare the children + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + break; + } + // otherwise we can fallthrough and do the same thing we do for lists + case NANOARROW_TYPE_LIST: + compare_child_subset(expected, 0, actual, 0, expected->length); + break; + default: + for (int64_t i = 0; i < actual->array->n_buffers; ++i) { + SCOPED_TRACE("buffer " + std::to_string(i)); + auto expected_buf = expected->buffer_views[i]; + auto actual_buf = actual->buffer_views[i]; + + EXPECT_TRUE(0 == std::memcmp(expected_buf.data.data, + actual_buf.data.data, + expected_buf.size_bytes)); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowHostDeviceTest : public BaseToArrowHostFixture {}; +template +struct ToArrowHostDeviceTestDurationsTest : public BaseToArrowHostFixture {}; + +TYPED_TEST_SUITE(ToArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto got_arrow_host = cudf::to_arrow_host(tbl->view()); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, arr.get(), nullptr)); + + 
NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TEST_F(ToArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::initializer_list{1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TYPED_TEST(ToArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + expected_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray expected_arr; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + expected_arr->null_count = 0; + + 
ArrowArraySetValidityBitmap(expected_arr->children[0], &mask); + expected_arr->children[0]->length = input_view.num_rows(); + expected_arr->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(expected_arr->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(expected_arr->children[0]->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view input_view({struct_col->view()}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], 
"string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + // first our underlying arrays + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + // struct null will get pushed down and superimposed on this array + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 0}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + nanoarrow::UniqueArray expected_arr; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + + auto array_a = expected_arr->children[0]; + auto view_a = input_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = input_view.num_rows(); + array_a->children[3]->null_count = 0; + + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(array_a->children[3]->children[0]); + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = 
array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowHostDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, 
expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + 
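The expected buffer is built from __int128_t even though the column under test stores decimal64 values: to_arrow_host widens decimal32/decimal64 to Arrow's decimal128 by keeping each value in the low-order words and filling the remaining words with the sign, as in this assumed sketch (not part of the patch):

    // int64_t    in  = -5;   // 0xFFFFFFFFFFFFFFFB
    // __int128_t out = in;   // low 64 bits:  0xFFFFFFFFFFFFFFFB
    //                        // high 64 bits: 0xFFFFFFFFFFFFFFFF (sign extension)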
get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + 
compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + 
nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + 
ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + 
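The negated scale above reflects the two libraries' differing sign conventions: cudf's fixed_point stores value = significand * 10^scale, while an Arrow decimal stores value = unscaled / 10^scale, so a cudf scale of s maps to an Arrow scale of -s. A small assumed example (not from the patch):

    // cudf:  decimal128 significand 12345, scale -2  ->  123.45
    // arrow: decimal128 unscaled    12345, scale  2  ->  123.45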
NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +struct ToArrowHostDeviceTestSlice + : public ToArrowHostDeviceTest, + public ::testing::WithParamInterface> {}; + +TEST_P(ToArrowHostDeviceTestSlice, SliceTest) +{ + auto [table, expected_schema, expected_array] = get_nanoarrow_host_tables(10000); + auto cudf_table_view = table->view(); + auto const [start, end] = GetParam(); + + slice_host_nanoarrow(expected_array.get(), start, end); + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto got_arrow_host = cudf::to_arrow_host(sliced_cudf_table); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +INSTANTIATE_TEST_CASE_P(ToArrowHostDeviceTest, + ToArrowHostDeviceTestSlice, + ::testing::Values(std::make_tuple(0, 10000), + std::make_tuple(100, 3000), + std::make_tuple(0, 0), + std::make_tuple(0, 3000))); From 743264f6ac924fdbec58fad666f989b14b901a98 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 24 Jul 2024 05:32:31 -0500 Subject: [PATCH 11/15] Warn on cuDF failure when `POLARS_VERBOSE` is true (#16308) Just something quick to get us started here Closes https://github.com/rapidsai/cudf/issues/16256 Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: 
https://github.com/rapidsai/cudf/pull/16308 --- python/cudf_polars/cudf_polars/callback.py | 12 +++++++- python/cudf_polars/tests/test_config.py | 34 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/test_config.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 764cdd3b3ca..f31193aa938 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,11 +5,15 @@ from __future__ import annotations +import os +import warnings from functools import partial from typing import TYPE_CHECKING import nvtx +from polars.exceptions import PerformanceWarning + from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -61,6 +65,12 @@ def execute_with_cudf( try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except exception: + except exception as e: + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn( + f"Query execution with GPU not supported, reason: {type(e)}: {e}", + PerformanceWarning, + stacklevel=2, + ) if raise_on_fail: raise diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py new file mode 100644 index 00000000000..5b4bba55552 --- /dev/null +++ b/python/cudf_polars/tests/test_config.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl.ir import IR +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +def test_polars_verbose_warns(monkeypatch): + def raise_unimplemented(self): + raise NotImplementedError("We don't support this") + + monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + q = pl.LazyFrame({}) + # Ensure that things raise + assert_ir_translation_raises(q, NotImplementedError) + with ( + pl.Config(verbose=True), + pytest.raises(pl.exceptions.ComputeError), + pytest.warns( + pl.exceptions.PerformanceWarning, + match="Query execution with GPU not supported", + ), + ): + # And ensure that collecting issues the correct warning. 
+ assert_gpu_result_equal(q) From 7191b74ce244518f17ef65e701f5a262f1c5cf8a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:55:48 -1000 Subject: [PATCH 12/15] Align Index __init__ APIs with pandas 2.x (#16362) * It would be nice to have `Index`'s constructor to not go through `IndexMeta.__call__`, but I think that would be a separate effort * There were a couple `verify_integrity` keyword arguments added that don't raise a `NotImplementedError` since there's not support, but I don't think it's worth making this case falling back in `cudf.pandas` as it's just a validation and won't affect further behavior with the object Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16362 --- docs/cudf/source/conf.py | 1 + python/cudf/cudf/core/index.py | 48 ++++++++++++++++++++++------- python/cudf/cudf/core/multiindex.py | 2 +- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c3c14ac8cad..f544536fb31 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -556,6 +556,7 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "Dtype"), # The following are erroneously warned due to # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:obj", "cudf.Index.values_host"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 73b7298410a..1c48b8f4f2d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -78,6 +78,11 @@ class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" def __call__(cls, data, *args, **kwargs): + if kwargs.get("tupleize_cols", True) is not True: + raise NotImplementedError( + "tupleize_cols is currently not supported." + ) + if cls is Index: return as_index( arbitrary=data, @@ -997,21 +1002,23 @@ def __dask_tokenize__(self): class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ - An array of orderable values that represent the indices of another Column + Immutable sequence used for indexing and alignment. - Attributes - ---------- - _values: A Column object - name: A string + The basic object storing axis labels for all pandas objects. Parameters ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. + data : array-like (1-dimensional) + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + copy : bool, default False + Copy input data. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + Currently not supported. """ @_performance_tracking @@ -1735,8 +1742,18 @@ def __init__( if tz is not None: raise NotImplementedError("tz is not yet supported") if normalize is not False: + warnings.warn( + "The 'normalize' keyword is " + "deprecated and will be removed in a future version. 
", + FutureWarning, + ) raise NotImplementedError("normalize == True is not yet supported") if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) raise NotImplementedError("closed is not yet supported") if ambiguous != "raise": raise NotImplementedError("ambiguous is not yet supported") @@ -2480,6 +2497,14 @@ def __init__( if freq is not None: raise NotImplementedError("freq is not yet supported") + if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("closed is not yet supported") + if unit is not None: warnings.warn( "The 'unit' keyword is " @@ -2863,6 +2888,7 @@ def __init__( dtype=None, copy: bool = False, name=None, + verify_integrity: bool = True, ): name = _getdefault_name(data, name=name) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ff4b06c6334..dfc596bf279 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -150,7 +150,7 @@ def __init__( dtype=None, copy=False, name=None, - **kwargs, + verify_integrity=True, ): if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") From 8fcf72a787acb0168c97d11b8ab9130146e9b37e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 24 Jul 2024 12:06:29 -0500 Subject: [PATCH 13/15] [JNI] Add setKernelPinnedCopyThreshold and setPinnedAllocationThreshold (#16288) In 24.08 two new cuDF methods are being added, and the second method is still in flight (see: https://github.com/rapidsai/cudf/pull/16206): ``` cudf::set_kernel_pinned_copy_threshold cudf::set_allocate_host_as_pinned_threshold ``` We'd like to expose these methods in our JNI layer. I created a Cudf.java with the two static methods, and put the definitions in CudfJni.cpp. Marked as draft since I need https://github.com/rapidsai/cudf/pull/16206 to merge, and we are still testing it. Authors: - Alessandro Bellina (https://github.com/abellina) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/16288 --- java/src/main/java/ai/rapids/cudf/Cudf.java | 36 +++++++++++++++++++++ java/src/main/native/src/CudfJni.cpp | 25 ++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/Cudf.java diff --git a/java/src/main/java/ai/rapids/cudf/Cudf.java b/java/src/main/java/ai/rapids/cudf/Cudf.java new file mode 100644 index 00000000000..d09e2f87ed4 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/Cudf.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +public class Cudf { + + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * cuDF copies that are smaller than the threshold will use a kernel to copy, instead + * of cudaMemcpyAsync. + */ + public static native void setKernelPinnedCopyThreshold(long kernelPinnedCopyThreshold); + + /** + * cudf allocations that are smaller than the threshold will use the pinned host + * memory resource. + */ + public static native void setPinnedAllocationThreshold(long pinnedAllocationThreshold); +} diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index 698a8f6ff02..2860dc2e4b2 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -201,4 +202,28 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv* env, j return cudf::jni::is_ptds_enabled; } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setKernelPinnedCopyThreshold(JNIEnv* env, + jclass clazz, + jlong jthreshold) +{ + try { + cudf::jni::auto_set_device(env); + auto threshold = static_cast(jthreshold); + cudf::set_kernel_pinned_copy_threshold(threshold); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setPinnedAllocationThreshold(JNIEnv* env, + jclass clazz, + jlong jthreshold) +{ + try { + cudf::jni::auto_set_device(env); + auto threshold = static_cast(jthreshold); + cudf::set_allocate_host_as_pinned_threshold(threshold); + } + CATCH_STD(env, ) +} + } // extern "C" From 73937fbabaeea76665663ed23688b1cac61b7ee9 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:42:00 -0400 Subject: [PATCH 14/15] Migrate lists/filling to pylibcudf (#16189) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16189 --- .../_lib/pylibcudf/libcudf/lists/filling.pxd | 19 ++++++++++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 38 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 16 ++++++++ 4 files changed, 75 insertions(+) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd new file mode 100644 index 00000000000..8403fd179f7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view + + +cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& sizes, + ) except + + + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& steps, + const column_view& sizes, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index cacecae6010..6e9bd5ff76b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -36,4 +36,6 @@ cpdef Column extract_list_element(Column, ColumnOrSizeType) cpdef Column count_elements(Column) +cpdef Column sequences(Column, Column, Column steps = *) + cpdef Column sort_lists(Column, bool, null_order, bool stable = *) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index b5661a3e634..3837eaaca78 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + filling as cpp_filling, gather as cpp_gather, reverse as cpp_reverse, ) @@ -326,6 +327,43 @@ cpdef Column count_elements(Column input): return Column.from_libcudf(move(c_result)) +cpdef Column sequences(Column starts, Column sizes, Column steps = None): + """Create a lists column in which each row contains a sequence of + values specified by a tuple of (start, step, size) parameters. + + For details, see :cpp:func:`sequences`. + + Parameters + ---------- + starts : Column + First values in the result sequences. + sizes : Column + Numbers of values in the result sequences. + steps : Optional[Column] + Increment values for the result sequences. + + Returns + ------- + Column + The result column containing generated sequences. 
+ """ + cdef unique_ptr[column] c_result + + if steps is not None: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + steps.view(), + sizes.view(), + )) + else: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + sizes.view(), + )) + return Column.from_libcudf(move(c_result)) + cpdef Column sort_lists( Column input, bool ascending, diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 87472f6d59b..0b2e0e00ce8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -198,6 +198,22 @@ def test_count_elements(test_data): assert_column_eq(expect, res) +def test_sequences(): + starts = plc.interop.from_arrow(pa.array([0, 1, 2, 3, 4])) + steps = plc.interop.from_arrow(pa.array([2, 1, 1, 1, -3])) + sizes = plc.interop.from_arrow(pa.array([0, 2, 2, 1, 3])) + + res1 = plc.lists.sequences(starts, sizes, steps) + res2 = plc.lists.sequences(starts, sizes) + + expect1 = pa.array([[], [1, 2], [2, 3], [3], [4, 1, -2]]) + expect2 = pa.array([[], [1, 2], [2, 3], [3], [4, 5, 6]]) + + assert_column_eq(expect1, res1) + + assert_column_eq(expect2, res2) + + @pytest.mark.parametrize( "ascending,na_position,expected", [ From 8bba6dfad239b4fd69a82acbc5dd7707ba576cce Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 24 Jul 2024 18:16:03 -0400 Subject: [PATCH 15/15] Migrate lists/set_operations to pylibcudf (#16190) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16190 --- .../libcudf/lists/set_operations.pxd | 39 ++++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 8 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 203 +++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 90 ++++++++ 4 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd new file mode 100644 index 00000000000..eb796897f87 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) +from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality + + +cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] difference_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] have_overlap( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] intersect_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] union_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 6e9bd5ff76b..4e2406c2aea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -39,3 +39,11 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) cpdef Column sort_lists(Column, bool, null_order, bool stable = *) + +cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 3837eaaca78..7555c8c6970 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.lists cimport ( filling as cpp_filling, gather as cpp_gather, reverse as cpp_reverse, + set_operations as cpp_set_operations, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, @@ -29,7 +30,13 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( stable_sort_lists as cpp_stable_sort_lists, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + nan_equality, + null_equality, + null_order, + order, + size_type, +) from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView @@ -413,3 +420,197 @@ cpdef Column sort_lists( na_position, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column difference_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a column of index values indicating the position of a search + key row within the corresponding list row in the lists column. + + For details, see :cpp:func:`difference_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. 
+ nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the difference results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.difference_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column have_overlap( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Check if lists at each row of the given lists columns overlap. + + For details, see :cpp:func:`have_overlap`. + + Parameters + ---------- + lhs : Column + The input lists column for one side. + rhs : Column + The input lists column for the other side. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A column containing the check results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.have_overlap( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column intersect_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements common to two input lists columns. + + For details, see :cpp:func:`intersect_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the intersection results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.intersect_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column union_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements found in + either of two input lists columns. 
+ + For details, see :cpp:func:`union_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the union results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.union_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 0b2e0e00ce8..f135ab4ccff 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import pytest from utils import assert_column_eq @@ -22,6 +23,13 @@ def column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def set_lists_column(): + lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]] + rhs = [[np.nan, 1, 2, 3], [4, 5], [None, 7, 8], [None, None]] + return lhs, rhs + + @pytest.fixture def lists_column(): return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]] @@ -253,3 +261,85 @@ def test_sort_lists(lists_column, ascending, na_position, expected): assert_column_eq(expect, res) assert_column_eq(expect, res_stable) + + +@pytest.mark.parametrize( + "set_operation,nans_equal,nulls_equal,expected", + [ + ( + plc.lists.difference_distinct, + True, + True, + [[], [1, 2, 3], None, [4, 5]], + ), + ( + plc.lists.difference_distinct, + False, + True, + [[], [1, 2, 3], None, [4, None, 5]], + ), + ( + plc.lists.have_overlap, + True, + True, + [True, False, None, True], + ), + ( + plc.lists.have_overlap, + False, + False, + [True, False, None, False], + ), + ( + plc.lists.intersect_distinct, + True, + True, + [[np.nan, 1, 2], [], None, [None]], + ), + ( + plc.lists.intersect_distinct, + True, + False, + [[1, 2], [], None, [None]], + ), + ( + plc.lists.union_distinct, + False, + True, + [ + [np.nan, 2, 1, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ( + plc.lists.union_distinct, + False, + False, + [ + [np.nan, np.nan, 2, 1, np.nan, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ], +) +def test_set_operations( + set_lists_column, set_operation, nans_equal, nulls_equal, expected +): + lhs, rhs = set_lists_column + + res = set_operation( + plc.interop.from_arrow(pa.array(lhs)), + plc.interop.from_arrow(pa.array(rhs)), + nans_equal, + nulls_equal, + ) + + if set_operation != plc.lists.have_overlap: + expect = pa.array(expected, type=pa.list_(pa.float64())) + else: + expect = pa.array(expected) + assert_column_eq(expect, res)
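
Taken together, patches 14 and 15 expose list sequence generation and list set operations through pylibcudf. The sketch below is an illustrative use of the new bindings and is not part of the patch series itself; it assumes a cudf build that contains both patches and uses the in-tree import path that the pylibcudf tests rely on at this point in the series. Expected results are noted in comments; element order within each list produced by the set operations is not guaranteed.

```
# Illustrative sketch only (not part of the patches above).
# Assumes a build including patches 14 and 15; the import path below is the
# in-tree location used by the pylibcudf tests at the time of this series.
import pyarrow as pa

import cudf._lib.pylibcudf as plc

# lists.sequences(starts, sizes, steps=None): one generated sequence per row.
starts = plc.interop.from_arrow(pa.array([0, 10]))
sizes = plc.interop.from_arrow(pa.array([3, 2]))
steps = plc.interop.from_arrow(pa.array([1, 5]))
seq = plc.lists.sequences(starts, sizes, steps)
# seq holds [[0, 1, 2], [10, 15]]

# lists.have_overlap / lists.union_distinct: row-wise set operations on two
# lists columns, with nulls and NaNs treated as equal by default.
lhs = plc.interop.from_arrow(pa.array([[1, 2, 2], [3]]))
rhs = plc.interop.from_arrow(pa.array([[2, 4], [None]]))
overlap = plc.lists.have_overlap(lhs, rhs)   # [True, False]
union = plc.lists.union_distinct(lhs, rhs)   # [[1, 2, 4], [3, None]], order unspecified
```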