From 01d35eac55aac310fc1b24d3d19b06649f775e70 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 09:35:54 -0700
Subject: [PATCH 01/18] Add `pinned_host_vector`

---
 cpp/include/cudf/io/detail/utils.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp
index adb7078d96d..666deb6776e 100644
--- a/cpp/include/cudf/io/detail/utils.hpp
+++ b/cpp/include/cudf/io/detail/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,10 @@
 
 #pragma once
 
+#include <cudf/detail/utilities/pinned_allocator.hpp>
+
+#include <thrust/host_vector.h>
+
 namespace cudf {
 namespace io {
 namespace detail {
@@ -23,6 +27,13 @@ namespace detail {
  * @brief Whether writer writes in chunks or all at once
  */
 enum class SingleWriteMode : bool { YES, NO };
+
+/**
+ * @brief Helper for pinned host memory
+ */
+template <typename T>
+using pinned_host_vector = thrust::host_vector<T, cudf::detail::pinned_allocator<T>>();
+
 }  // namespace detail
 }  // namespace io
 }  // namespace cudf

From e2393ed772e0989344ddf241c6d57da167eb66a3 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 09:39:52 -0700
Subject: [PATCH 02/18] Revert "Add `pinned_host_vector`"

This reverts commit 01d35eac55aac310fc1b24d3d19b06649f775e70.
---
 cpp/include/cudf/io/detail/utils.hpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp
index 666deb6776e..adb7078d96d 100644
--- a/cpp/include/cudf/io/detail/utils.hpp
+++ b/cpp/include/cudf/io/detail/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,6 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/pinned_allocator.hpp>
-
-#include <thrust/host_vector.h>
-
 namespace cudf {
 namespace io {
 namespace detail {
@@ -27,13 +23,6 @@ namespace detail {
  * @brief Whether writer writes in chunks or all at once
  */
 enum class SingleWriteMode : bool { YES, NO };
-
-/**
- * @brief Helper for pinned host memory
- */
-template <typename T>
-using pinned_host_vector = thrust::host_vector<T, cudf::detail::pinned_allocator<T>>();
-
 }  // namespace detail
 }  // namespace io
 }  // namespace cudf

From 7aa1d75ac826ba9f9e3279f49358ceeed02a3b65 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 09:42:16 -0700
Subject: [PATCH 03/18] Move `pinned_host_vector` into `pinned_allocator.hpp`

---
 .../cudf/detail/utilities/pinned_allocator.hpp        | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp
index 84abf7c014f..283511d7491 100644
--- a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp
+++ b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2022 NVIDIA Corporation
+ *  Copyright 2008-2023 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@
 
 #include <cudf/utilities/error.hpp>
 
+#include <thrust/host_vector.h>
+
 namespace cudf::detail {
 
 /*! \p pinned_allocator is a CUDA-specific host memory allocator
@@ -199,4 +201,11 @@ class pinned_allocator {
     return !operator==(x);
   }
 };
+
+/**
+ * @brief Helper for pinned host memory
+ */
+template <typename T>
+using pinned_host_vector = thrust::host_vector<T, pinned_allocator<T>>();
+
 }  // namespace cudf::detail

From ca18ce951631471665852b3251381211f5da9069 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 09:53:05 -0700
Subject: [PATCH 04/18] Switch to use `pinned_host_vector`

---
 conda/recipes/libcudf/meta.yaml                 |  2 +-
 cpp/benchmarks/io/text/multibyte_split.cpp      |  5 ++---
 ...ned_allocator.hpp => pinned_host_vector.hpp} |  2 +-
 cpp/src/io/text/bgzip_data_chunk_source.cu      | 13 +++++--------
 cpp/src/io/text/data_chunk_source_factories.cpp | 17 ++++++-----------
 cpp/src/io/utilities/hostdevice_vector.hpp      |  7 +++----
 6 files changed, 18 insertions(+), 28 deletions(-)
 rename cpp/include/cudf/detail/utilities/{pinned_allocator.hpp => pinned_host_vector.hpp} (99%)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 2bb571f858d..94b244b94d5 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -139,7 +139,7 @@ outputs:
         - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp
         - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp
         - test -f $PREFIX/include/cudf/detail/utilities/logger.hpp
-        - test -f $PREFIX/include/cudf/detail/utilities/pinned_allocator.hpp
+        - test -f $PREFIX/include/cudf/detail/utilities/pinned_host_vector.hpp
         - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp
         - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp
         - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp
diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 41b5ddb567e..a697c98a320 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -23,7 +23,7 @@
 #include <cudf_test/file_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/utilities/pinned_allocator.hpp>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
@@ -33,7 +33,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/transform.h>
 
 #include <nvbench/nvbench.cuh>
@@ -136,7 +135,7 @@ static void bench_multibyte_split(nvbench::state& state,
   std::unique_ptr<cudf::io::datasource> datasource;
   auto device_input      = create_random_input(file_size_approx, delim_factor, 0.05, delim);
   auto host_input        = std::vector<char>{};
-  auto host_pinned_input = thrust::host_vector<char, cudf::detail::pinned_allocator<char>>{};
+  auto host_pinned_input = cudf::detail::pinned_host_vector<char>{};
 
   if (source_type != data_chunk_source_type::device &&
       source_type != data_chunk_source_type::host_pinned) {
diff --git a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
similarity index 99%
rename from cpp/include/cudf/detail/utilities/pinned_allocator.hpp
rename to cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
index 283511d7491..461e76bc218 100644
--- a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp
+++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
@@ -203,7 +203,7 @@ class pinned_allocator {
 };
 
 /**
- * @brief Helper for pinned host memory
+ * @brief A vector class with pinned host memory allocator
  */
 template <typename T>
 using pinned_host_vector = thrust::host_vector<T, pinned_allocator<T>>();
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index 32678191cb1..715f70605df 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -19,7 +19,7 @@
 #include "io/utilities/config_utils.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/pinned_allocator.hpp>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -65,10 +65,7 @@ struct bgzip_nvcomp_transform_functor {
 class bgzip_data_chunk_reader : public data_chunk_reader {
  private:
   template <typename T>
-  using pinned_host_vector = thrust::host_vector<T, cudf::detail::pinned_allocator<T>>;
-
-  template <typename T>
-  static void copy_to_device(const pinned_host_vector<T>& host,
+  static void copy_to_device(cudf::detail::pinned_host_vector<T> const& host,
                              rmm::device_uvector<T>& device,
                              rmm::cuda_stream_view stream)
   {
@@ -84,9 +81,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
       1 << 16;  // 64k offset allocation, resized on demand
 
     cudaEvent_t event;
-    pinned_host_vector<char> h_compressed_blocks;
-    pinned_host_vector<std::size_t> h_compressed_offsets;
-    pinned_host_vector<std::size_t> h_decompressed_offsets;
+    cudf::detail::pinned_host_vector<char> h_compressed_blocks;
+    cudf::detail::pinned_host_vector<std::size_t> h_compressed_offsets;
+    cudf::detail::pinned_host_vector<std::size_t> h_decompressed_offsets;
     rmm::device_uvector<char> d_compressed_blocks;
     rmm::device_uvector<char> d_decompressed_blocks;
     rmm::device_uvector<std::size_t> d_compressed_offsets;
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index b6c88c1346f..1ff1a4f8ebf 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -17,7 +17,7 @@
 #include "io/text/device_data_chunks.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/pinned_allocator.hpp>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -30,16 +30,16 @@ namespace cudf::io::text {
 
 namespace {
 
+struct host_ticket {
+  cudaEvent_t event;
+  cudf::detail::pinned_host_vector<char> buffer;
+};
+
 /**
  * @brief A reader which produces owning chunks of device memory which contain a copy of the data
  * from an istream.
  */
 class datasource_chunk_reader : public data_chunk_reader {
-  struct host_ticket {
-    cudaEvent_t event;
-    thrust::host_vector<char, cudf::detail::pinned_allocator<char>> buffer;
-  };
-
   constexpr static int num_tickets = 2;
 
  public:
@@ -114,11 +114,6 @@ class datasource_chunk_reader : public data_chunk_reader {
  * from an istream.
  */
 class istream_data_chunk_reader : public data_chunk_reader {
-  struct host_ticket {
-    cudaEvent_t event;
-    thrust::host_vector<char, cudf::detail::pinned_allocator<char>> buffer;
-  };
-
   constexpr static int num_tickets = 2;
 
  public:
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index ff8502b1dcd..566132f8463 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -19,7 +19,7 @@
 #include "config_utils.hpp"
 #include "hostdevice_span.hpp"
 
-#include <cudf/detail/utilities/pinned_allocator.hpp>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
@@ -67,7 +67,7 @@ class hostdevice_vector {
     if (hostdevice_vector_uses_pageable_buffer()) {
       h_data_owner = thrust::host_vector<T>();
     } else {
-      h_data_owner = thrust::host_vector<T, cudf::detail::pinned_allocator<T>>();
+      h_data_owner = cudf::detail::pinned_host_vector<T>();
     }
 
     std::visit(
@@ -177,8 +177,7 @@ class hostdevice_vector {
   }
 
  private:
-  std::variant<thrust::host_vector<T>, thrust::host_vector<T, cudf::detail::pinned_allocator<T>>>
-    h_data_owner;
+  std::variant<thrust::host_vector<T>, cudf::detail::pinned_host_vector<T>> h_data_owner;
   T* host_data        = nullptr;
   size_t current_size = 0;
   rmm::device_uvector<T> d_data;

From d7070e73a5f51de900f03b17566ed8b0c38e412f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 10:00:06 -0700
Subject: [PATCH 05/18] Adopt `pinned_host_vector`

---
 cpp/src/io/orc/writer_impl.cu     | 58 +++++++++----------------------
 cpp/src/io/orc/writer_impl.hpp    |  2 +-
 cpp/src/io/parquet/writer_impl.cu | 22 +++---------
 3 files changed, 23 insertions(+), 59 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index f1eb52f63f1..d3f0d82927e 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -28,6 +28,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -79,11 +80,6 @@ struct row_group_index_info {
 };
 
 namespace {
-/**
- * @brief Helper for pinned host memory
- */
-template <typename T>
-using pinned_buffer = std::unique_ptr<T, decltype(&cudaFreeHost)>;
 
 /**
  * @brief Translates ORC compression to nvCOMP compression
@@ -2196,28 +2192,17 @@ std::unique_ptr<table_input_metadata> make_table_meta(table_view const& input)
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return A tuple of the intermediate results containing the processed data
  */
-std::tuple<orc_streams,
-           hostdevice_vector<compression_result>,
-           hostdevice_2dvector<gpu::StripeStream>,
-           encoded_data,
-           file_segmentation,
-           hostdevice_2dvector<gpu::StripeDictionary>,
-           std::vector<StripeInformation>,
-           orc_table_view,
-           rmm::device_buffer,
-           intermediate_statistics,
-           pinned_buffer<uint8_t>>
-convert_table_to_orc_data(table_view const& input,
-                          table_input_metadata const& table_meta,
-                          stripe_size_limits max_stripe_size,
-                          size_type row_index_stride,
-                          bool enable_dictionary,
-                          CompressionKind compression_kind,
-                          size_t compression_blocksize,
-                          statistics_freq stats_freq,
-                          bool single_write_mode,
-                          data_sink const& out_sink,
-                          rmm::cuda_stream_view stream)
+auto convert_table_to_orc_data(table_view const& input,
+                               table_input_metadata const& table_meta,
+                               stripe_size_limits max_stripe_size,
+                               size_type row_index_stride,
+                               bool enable_dictionary,
+                               CompressionKind compression_kind,
+                               size_t compression_blocksize,
+                               statistics_freq stats_freq,
+                               bool single_write_mode,
+                               data_sink const& out_sink,
+                               rmm::cuda_stream_view stream)
 {
   auto const input_tview = table_device_view::create(input, stream);
 
@@ -2298,7 +2283,7 @@ convert_table_to_orc_data(table_view const& input,
             std::move(orc_table),
             rmm::device_buffer{},  // compressed_data
             intermediate_statistics{stream},
-            pinned_buffer<uint8_t>{nullptr, cudaFreeHost}};
+            cudf::detail::pinned_host_vector<uint8_t>{}};
   }
 
   // Allocate intermediate output stream buffer
@@ -2333,16 +2318,7 @@ convert_table_to_orc_data(table_view const& input,
       max_stream_size = std::max(max_stream_size, stream_size);
     }
 
-    if (all_device_write) {
-      return pinned_buffer<uint8_t>{nullptr, cudaFreeHost};
-    } else {
-      return pinned_buffer<uint8_t>{[](size_t size) {
-                                      uint8_t* ptr = nullptr;
-                                      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-                                      return ptr;
-                                    }(max_stream_size),
-                                    cudaFreeHost};
-    }
+    return cudf::detail::pinned_host_vector{all_device_write ? 0 : max_stream_size};
   }();
 
   // Compress the data streams
@@ -2489,7 +2465,7 @@ void writer::impl::write(table_view const& input)
                          orc_table,
                          compressed_data,
                          intermediate_stats,
-                         stream_output.get());
+                         stream_output);
 
   // Update data into the footer. This needs to be called even when num_rows==0.
   add_table_to_footer_data(orc_table, stripes);
@@ -2504,7 +2480,7 @@ void writer::impl::write_orc_data_to_sink(orc_streams& streams,
                                           orc_table_view const& orc_table,
                                           rmm::device_buffer const& compressed_data,
                                           intermediate_statistics& intermediate_stats,
-                                          uint8_t* stream_output)
+                                          host_span<uint8_t> stream_output)
 {
   if (orc_table.num_rows() == 0) { return; }
 
@@ -2544,7 +2520,7 @@ void writer::impl::write_orc_data_to_sink(orc_streams& streams,
         strm_desc,
         enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first],
         static_cast<uint8_t const*>(compressed_data.data()),
-        stream_output,
+        stream_output.data(),
         &stripe,
         &streams,
         _compression_kind,
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index f003de8087e..96c6593556f 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -313,7 +313,7 @@ class writer::impl {
                               orc_table_view const& orc_table,
                               rmm::device_buffer const& compressed_data,
                               intermediate_statistics& intermediate_stats,
-                              uint8_t* stream_output);
+                              host_span<uint8_t> stream_output);
 
   /**
    * @brief Add the processed table data into the internal file footer.
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 79581c0d21c..e2ead6f9985 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -34,6 +34,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/linked_column.hpp>
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
@@ -67,11 +68,6 @@ using namespace cudf::io::parquet;
 using namespace cudf::io;
 
 namespace {
-/**
- * @brief Helper for pinned host memory
- */
-template <typename T>
-using pinned_buffer = std::unique_ptr<T, decltype(&cudaFreeHost)>;
 
 /**
  * @brief Function that translates GDF compression to parquet compression
@@ -1835,7 +1831,7 @@ void writer::impl::write(table_view const& table, std::vector<partition_info> co
                        num_stats_bfr);
   }
 
-  pinned_buffer<uint8_t> host_bfr{nullptr, cudaFreeHost};
+  cudf::detail::pinned_host_vector<uint8_t> host_bfr(max_chunk_bfr_size);
 
   // Encode row groups in batches
   for (auto b = 0, r = 0; b < static_cast<size_type>(batch_list.size()); b++) {
@@ -1889,25 +1885,17 @@ void writer::impl::write(table_view const& table, std::vector<partition_info> co
             _stream.synchronize();
           }
         } else {
-          if (!host_bfr) {
-            host_bfr = pinned_buffer<uint8_t>{[](size_t size) {
-                                                uint8_t* ptr = nullptr;
-                                                CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-                                                return ptr;
-                                              }(max_chunk_bfr_size),
-                                              cudaFreeHost};
-          }
           // copy the full data
-          CUDF_CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
+          CUDF_CUDA_TRY(cudaMemcpyAsync(host_bfr.data(),
                                         dev_bfr,
                                         ck.ck_stat_size + ck.compressed_size,
                                         cudaMemcpyDefault,
                                         _stream.value()));
           _stream.synchronize();
-          _out_sink[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size);
+          _out_sink[p]->host_write(host_bfr.data() + ck.ck_stat_size, ck.compressed_size);
           if (ck.ck_stat_size != 0) {
             column_chunk_meta.statistics_blob.resize(ck.ck_stat_size);
-            memcpy(column_chunk_meta.statistics_blob.data(), host_bfr.get(), ck.ck_stat_size);
+            memcpy(column_chunk_meta.statistics_blob.data(), host_bfr.data(), ck.ck_stat_size);
           }
         }
         row_group.total_byte_size += ck.compressed_size;

From 7358c7891071e80ce4b135f1628590c582800022 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 10:06:22 -0700
Subject: [PATCH 06/18] Fix bug in type alias

---
 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
index 461e76bc218..83f061e9407 100644
--- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
@@ -206,6 +206,6 @@ class pinned_allocator {
  * @brief A vector class with pinned host memory allocator
  */
 template <typename T>
-using pinned_host_vector = thrust::host_vector<T, pinned_allocator<T>>();
+using pinned_host_vector = thrust::host_vector<T, pinned_allocator<T>>;
 
 }  // namespace cudf::detail

From a266de6a0fe2121cb8b39a213f0cefe3ef3582ce Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 10:20:52 -0700
Subject: [PATCH 07/18] Fix bug and rename variable

---
 cpp/src/io/orc/writer_impl.cu  | 56 +++++++++++++++++-----------------
 cpp/src/io/orc/writer_impl.hpp |  4 +--
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index d3f0d82927e..17312aa19bb 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2273,17 +2273,17 @@ auto convert_table_to_orc_data(table_view const& input,
   auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data, &strm_descs, stream);
 
   if (num_rows == 0) {
-    return {std::move(streams),
-            hostdevice_vector<compression_result>{},  // comp_results
-            std::move(strm_descs),
-            std::move(enc_data),
-            std::move(segmentation),
-            std::move(stripe_dict),
-            std::move(stripes),
-            std::move(orc_table),
-            rmm::device_buffer{},  // compressed_data
-            intermediate_statistics{stream},
-            cudf::detail::pinned_host_vector<uint8_t>{}};
+    return std::tuple{std::move(streams),
+                      hostdevice_vector<compression_result>{},  // comp_results
+                      std::move(strm_descs),
+                      std::move(enc_data),
+                      std::move(segmentation),
+                      std::move(stripe_dict),
+                      std::move(stripes),
+                      std::move(orc_table),
+                      rmm::device_buffer{},  // compressed_data
+                      intermediate_statistics{stream},
+                      cudf::detail::pinned_host_vector<uint8_t>()};
   }
 
   // Allocate intermediate output stream buffer
@@ -2297,7 +2297,7 @@ auto convert_table_to_orc_data(table_view const& input,
   auto const padded_block_header_size =
     util::round_up_unsafe<size_t>(block_header_size, compressed_block_align);
 
-  auto stream_output = [&]() {
+  auto bounce_buffer = [&]() {
     size_t max_stream_size = 0;
     bool all_device_write  = true;
 
@@ -2318,7 +2318,7 @@ auto convert_table_to_orc_data(table_view const& input,
       max_stream_size = std::max(max_stream_size, stream_size);
     }
 
-    return cudf::detail::pinned_host_vector{all_device_write ? 0 : max_stream_size};
+    return cudf::detail::pinned_host_vector<uint8_t>(all_device_write ? 0 : max_stream_size);
   }();
 
   // Compress the data streams
@@ -2350,17 +2350,17 @@ auto convert_table_to_orc_data(table_view const& input,
 
   auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream);
 
-  return {std::move(streams),
-          std::move(comp_results),
-          std::move(strm_descs),
-          std::move(enc_data),
-          std::move(segmentation),
-          std::move(stripe_dict),
-          std::move(stripes),
-          std::move(orc_table),
-          std::move(compressed_data),
-          std::move(intermediate_stats),
-          std::move(stream_output)};
+  return std::tuple{std::move(streams),
+                    std::move(comp_results),
+                    std::move(strm_descs),
+                    std::move(enc_data),
+                    std::move(segmentation),
+                    std::move(stripe_dict),
+                    std::move(stripes),
+                    std::move(orc_table),
+                    std::move(compressed_data),
+                    std::move(intermediate_stats),
+                    std::move(bounce_buffer)};
 }
 
 }  // namespace
@@ -2434,7 +2434,7 @@ void writer::impl::write(table_view const& input)
                          orc_table,
                          compressed_data,
                          intermediate_stats,
-                         stream_output] = [&] {
+                         bounce_buffer] = [&] {
     try {
       return convert_table_to_orc_data(input,
                                        *_table_meta,
@@ -2465,7 +2465,7 @@ void writer::impl::write(table_view const& input)
                          orc_table,
                          compressed_data,
                          intermediate_stats,
-                         stream_output);
+                         bounce_buffer);
 
   // Update data into the footer. This needs to be called even when num_rows==0.
   add_table_to_footer_data(orc_table, stripes);
@@ -2480,7 +2480,7 @@ void writer::impl::write_orc_data_to_sink(orc_streams& streams,
                                           orc_table_view const& orc_table,
                                           rmm::device_buffer const& compressed_data,
                                           intermediate_statistics& intermediate_stats,
-                                          host_span<uint8_t> stream_output)
+                                          host_span<uint8_t> bounce_buffer)
 {
   if (orc_table.num_rows() == 0) { return; }
 
@@ -2520,7 +2520,7 @@ void writer::impl::write_orc_data_to_sink(orc_streams& streams,
         strm_desc,
         enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first],
         static_cast<uint8_t const*>(compressed_data.data()),
-        stream_output.data(),
+        bounce_buffer.data(),
         &stripe,
         &streams,
         _compression_kind,
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 96c6593556f..533e60f0d0c 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -302,7 +302,7 @@ class writer::impl {
    * @param orc_table Non-owning view of a cuDF table that includes ORC-related information
    * @param compressed_data Compressed stream data
    * @param intermediate_stats Statistics data stored between calls to write
-   * @param stream_output Temporary host output buffer
+   * @param bounce_buffer Temporary host output buffer
    */
   void write_orc_data_to_sink(orc_streams& streams,
                               hostdevice_vector<compression_result> const& comp_results,
@@ -313,7 +313,7 @@ class writer::impl {
                               orc_table_view const& orc_table,
                               rmm::device_buffer const& compressed_data,
                               intermediate_statistics& intermediate_stats,
-                              host_span<uint8_t> stream_output);
+                              host_span<uint8_t> bounce_buffer);
 
   /**
    * @brief Add the processed table data into the internal file footer.

From 4d436870c8729d1100d00ac367eda561d83dd1d2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 10:31:34 -0700
Subject: [PATCH 08/18] Switch to use `SingleWriteMode`

---
 cpp/src/io/orc/writer_impl.cu      | 14 +++++++-------
 cpp/src/io/orc/writer_impl.hpp     |  8 ++++----
 cpp/src/io/parquet/writer_impl.cu  | 10 +++++-----
 cpp/src/io/parquet/writer_impl.hpp |  6 +++---
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 17312aa19bb..5877b61d32a 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -375,11 +375,11 @@ __global__ void copy_string_data(char* string_pool,
 }  // namespace
 
 void persisted_statistics::persist(int num_table_rows,
-                                   bool single_write_mode,
+                                   SingleWriteMode single_write_mode,
                                    intermediate_statistics& intermediate_stats,
                                    rmm::cuda_stream_view stream)
 {
-  if (not single_write_mode) {
+  if (single_write_mode == SingleWriteMode::YES) {
     // persist the strings in the chunks into a string pool and update pointers
     auto const num_chunks = static_cast<int>(intermediate_stats.stripe_stat_chunks.size());
     // min offset and max offset + 1 for total size
@@ -666,7 +666,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
                            std::map<uint32_t, size_t> const& decimal_column_sizes,
                            bool enable_dictionary,
                            CompressionKind compression_kind,
-                           bool single_write_mode)
+                           SingleWriteMode single_write_mode)
 {
   // 'column 0' row index stream
   std::vector<Stream> streams{{ROW_INDEX, 0}};  // TODO: Separate index and data streams?
@@ -681,7 +681,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
 
   for (auto& column : columns) {
     auto const is_nullable = [&]() -> bool {
-      if (single_write_mode) {
+      if (single_write_mode == SingleWriteMode::YES) {
         return column.nullable();
       } else {
         // For chunked write, when not provided nullability, we assume the worst case scenario
@@ -2200,7 +2200,7 @@ auto convert_table_to_orc_data(table_view const& input,
                                CompressionKind compression_kind,
                                size_t compression_blocksize,
                                statistics_freq stats_freq,
-                               bool single_write_mode,
+                               SingleWriteMode single_write_mode,
                                data_sink const& out_sink,
                                rmm::cuda_stream_view stream)
 {
@@ -2375,7 +2375,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
     _compression_kind(to_orc_compression(options.get_compression())),
     _compression_blocksize(compression_block_size(_compression_kind)),
     _stats_freq(options.get_statistics_freq()),
-    _single_write_mode(mode == SingleWriteMode::YES),
+    _single_write_mode(mode),
     _kv_meta(options.get_key_value_metadata()),
     _out_sink(std::move(sink))
 {
@@ -2395,7 +2395,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
     _compression_kind(to_orc_compression(options.get_compression())),
     _compression_blocksize(compression_block_size(_compression_kind)),
     _stats_freq(options.get_statistics_freq()),
-    _single_write_mode(mode == SingleWriteMode::YES),
+    _single_write_mode(mode),
     _kv_meta(options.get_key_value_metadata()),
     _out_sink(std::move(sink))
 {
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 533e60f0d0c..3d4e8ab7bf3 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -209,7 +209,7 @@ struct persisted_statistics {
   }
 
   void persist(int num_table_rows,
-               bool single_write_mode,
+               SingleWriteMode single_write_mode,
                intermediate_statistics& intermediate_stats,
                rmm::cuda_stream_view stream);
 
@@ -334,9 +334,9 @@ class writer::impl {
   CompressionKind const _compression_kind;
   size_t const _compression_blocksize;
   statistics_freq const _stats_freq;
-  bool const _single_write_mode;  // Special parameter only used by `write()` to indicate that
-                                  // we are guaranteeing a single table write. This enables some
-                                  // internal optimizations.
+  SingleWriteMode const _single_write_mode;  // Special parameter only used by `write()` to indicate
+                                             // that we are guaranteeing a single table write. This
+                                             // enables some internal optimizations.
   std::map<std::string, std::string> const _kv_meta;  // Optional user metadata.
   std::unique_ptr<data_sink> const _out_sink;
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index e2ead6f9985..b80016ebc48 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -506,7 +506,7 @@ struct leaf_schema_fn {
 
 inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
                             column_in_metadata const& col_meta,
-                            bool single_write_mode)
+                            SingleWriteMode single_write_mode)
 {
   if (col_meta.is_nullability_defined()) {
     CUDF_EXPECTS(col_meta.nullable() || !col->nullable(),
@@ -516,7 +516,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
   }
   // For chunked write, when not provided nullability, we assume the worst case scenario
   // that all columns are nullable.
-  return not single_write_mode or col->nullable();
+  return single_write_mode == SingleWriteMode::NO or col->nullable();
 }
 
 /**
@@ -528,7 +528,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
 std::vector<schema_tree_node> construct_schema_tree(
   cudf::detail::LinkedColVector const& linked_columns,
   table_input_metadata& metadata,
-  bool single_write_mode,
+  SingleWriteMode single_write_mode,
   bool int96_timestamps)
 {
   std::vector<schema_tree_node> schema;
@@ -1330,7 +1330,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
-    _single_write_mode(mode == SingleWriteMode::YES),
+    _single_write_mode(mode),
     _out_sink(std::move(sinks))
 {
   if (options.get_metadata()) {
@@ -1356,7 +1356,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
-    _single_write_mode(mode == SingleWriteMode::YES),
+    _single_write_mode(mode),
     _out_sink(std::move(sinks))
 {
   if (options.get_metadata()) {
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 34ad95f3ee4..d4fdb9a8813 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -229,9 +229,9 @@ class writer::impl {
   bool const _int96_timestamps;
   int32_t const _column_index_truncate_length;
   std::vector<std::map<std::string, std::string>> const _kv_meta;  // Optional user metadata.
-  bool const _single_write_mode = true;  // Special parameter only used by `write()` to indicate
-                                         // that we are guaranteeing a single table write.
-                                         // This enables some internal optimizations.
+  SingleWriteMode const _single_write_mode;  // Special parameter only used by `write()` to indicate
+                                             // that we are guaranteeing a single table write.
+                                             // This enables some internal optimizations.
   std::vector<std::unique_ptr<data_sink>> const _out_sink;
 
   // Internal states, filled during `write()` and written to sink during `write` and `close()`.

From d33ad7d4c6700baf001f29ccfe05316a9bbbaf71 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 24 Apr 2023 12:08:18 -0600
Subject: [PATCH 09/18] Update cpp/src/io/orc/writer_impl.cu

Co-authored-by: Yunsong Wang <wangyunsong89@gmail.com>
---
 cpp/src/io/orc/writer_impl.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 5877b61d32a..333fa141aea 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -379,7 +379,7 @@ void persisted_statistics::persist(int num_table_rows,
                                    intermediate_statistics& intermediate_stats,
                                    rmm::cuda_stream_view stream)
 {
-  if (single_write_mode == SingleWriteMode::YES) {
+  if (single_write_mode == SingleWriteMode::NO) {
     // persist the strings in the chunks into a string pool and update pointers
     auto const num_chunks = static_cast<int>(intermediate_stats.stripe_stat_chunks.size());
     // min offset and max offset + 1 for total size

From 155f99f1c022ac7da542eac21ca30f1b86651601 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 11:13:44 -0700
Subject: [PATCH 10/18] Rename variable

---
 cpp/src/io/orc/writer_impl.cu     | 14 +++++++-------
 cpp/src/io/orc/writer_impl.hpp    |  2 +-
 cpp/src/io/parquet/writer_impl.cu | 10 +++++-----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 333fa141aea..525bc5701ca 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -375,11 +375,11 @@ __global__ void copy_string_data(char* string_pool,
 }  // namespace
 
 void persisted_statistics::persist(int num_table_rows,
-                                   SingleWriteMode single_write_mode,
+                                   SingleWriteMode write_mode,
                                    intermediate_statistics& intermediate_stats,
                                    rmm::cuda_stream_view stream)
 {
-  if (single_write_mode == SingleWriteMode::NO) {
+  if (write_mode == SingleWriteMode::NO) {
     // persist the strings in the chunks into a string pool and update pointers
     auto const num_chunks = static_cast<int>(intermediate_stats.stripe_stat_chunks.size());
     // min offset and max offset + 1 for total size
@@ -666,7 +666,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
                            std::map<uint32_t, size_t> const& decimal_column_sizes,
                            bool enable_dictionary,
                            CompressionKind compression_kind,
-                           SingleWriteMode single_write_mode)
+                           SingleWriteMode write_mode)
 {
   // 'column 0' row index stream
   std::vector<Stream> streams{{ROW_INDEX, 0}};  // TODO: Separate index and data streams?
@@ -681,7 +681,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
 
   for (auto& column : columns) {
     auto const is_nullable = [&]() -> bool {
-      if (single_write_mode == SingleWriteMode::YES) {
+      if (write_mode == SingleWriteMode::YES) {
         return column.nullable();
       } else {
         // For chunked write, when not provided nullability, we assume the worst case scenario
@@ -2187,7 +2187,7 @@ std::unique_ptr<table_input_metadata> make_table_meta(table_view const& input)
  * @param compression_kind The compression kind
  * @param compression_blocksize The block size used for compression
  * @param stats_freq Column statistics granularity type for parquet/orc writers
- * @param single_write_mode Flag to indicate if there is only a single table write
+ * @param write_mode Flag to indicate if there is only a single table write
  * @param out_sink Sink for writing data
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return A tuple of the intermediate results containing the processed data
@@ -2200,7 +2200,7 @@ auto convert_table_to_orc_data(table_view const& input,
                                CompressionKind compression_kind,
                                size_t compression_blocksize,
                                statistics_freq stats_freq,
-                               SingleWriteMode single_write_mode,
+                               SingleWriteMode write_mode,
                                data_sink const& out_sink,
                                rmm::cuda_stream_view stream)
 {
@@ -2254,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input,
                                 decimal_column_sizes(dec_chunk_sizes.rg_sizes),
                                 enable_dictionary,
                                 compression_kind,
-                                single_write_mode);
+                                write_mode);
   auto enc_data = encode_columns(orc_table,
                                  std::move(dictionaries),
                                  std::move(dec_chunk_sizes),
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 3d4e8ab7bf3..f4fb8207671 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -209,7 +209,7 @@ struct persisted_statistics {
   }
 
   void persist(int num_table_rows,
-               SingleWriteMode single_write_mode,
+               SingleWriteMode write_mode,
                intermediate_statistics& intermediate_stats,
                rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index b80016ebc48..8b8c2be346f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -506,7 +506,7 @@ struct leaf_schema_fn {
 
 inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
                             column_in_metadata const& col_meta,
-                            SingleWriteMode single_write_mode)
+                            SingleWriteMode write_mode)
 {
   if (col_meta.is_nullability_defined()) {
     CUDF_EXPECTS(col_meta.nullable() || !col->nullable(),
@@ -516,7 +516,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
   }
   // For chunked write, when not provided nullability, we assume the worst case scenario
   // that all columns are nullable.
-  return single_write_mode == SingleWriteMode::NO or col->nullable();
+  return write_mode == SingleWriteMode::NO or col->nullable();
 }
 
 /**
@@ -528,7 +528,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
 std::vector<schema_tree_node> construct_schema_tree(
   cudf::detail::LinkedColVector const& linked_columns,
   table_input_metadata& metadata,
-  SingleWriteMode single_write_mode,
+  SingleWriteMode write_mode,
   bool int96_timestamps)
 {
   std::vector<schema_tree_node> schema;
@@ -542,7 +542,7 @@ std::vector<schema_tree_node> construct_schema_tree(
 
   std::function<void(cudf::detail::LinkedColPtr const&, column_in_metadata&, size_t)> add_schema =
     [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) {
-      bool col_nullable = is_col_nullable(col, col_meta, single_write_mode);
+      bool col_nullable = is_col_nullable(col, col_meta, write_mode);
 
       auto set_field_id = [&schema, parent_idx](schema_tree_node& s,
                                                 column_in_metadata const& col_meta) {
@@ -679,7 +679,7 @@ std::vector<schema_tree_node> construct_schema_tree(
         right_child_meta.set_name("value");
         // check the repetition type of key is required i.e. the col should be non-nullable
         auto key_col = col->children[lists_column_view::child_column_index]->children[0];
-        CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, single_write_mode),
+        CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode),
                      "key column cannot be nullable. For chunked writing, explicitly set the "
                      "nullability to false in metadata");
         // process key

From 4d110d17838516af98665ae0e0b1397f229d5f69 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 11:16:16 -0700
Subject: [PATCH 11/18] Rename `SingleWriteMode` into `single_write_mode`

---
 cpp/include/cudf/io/detail/orc.hpp     |  4 ++--
 cpp/include/cudf/io/detail/parquet.hpp |  4 ++--
 cpp/include/cudf/io/detail/utils.hpp   |  2 +-
 cpp/src/io/functions.cpp               |  8 ++++----
 cpp/src/io/orc/writer_impl.cu          | 18 +++++++++---------
 cpp/src/io/orc/writer_impl.hpp         | 12 ++++++------
 cpp/src/io/parquet/writer_impl.cu      | 14 +++++++-------
 cpp/src/io/parquet/writer_impl.hpp     | 10 +++++-----
 8 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 1a53690e317..b7794c0df6a 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -96,7 +96,7 @@ class writer {
    */
   explicit writer(std::unique_ptr<cudf::io::data_sink> sink,
                   orc_writer_options const& options,
-                  SingleWriteMode mode,
+                  single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
@@ -109,7 +109,7 @@ class writer {
    */
   explicit writer(std::unique_ptr<cudf::io::data_sink> sink,
                   chunked_orc_writer_options const& options,
-                  SingleWriteMode mode,
+                  single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 304d42a29c9..9a94924824d 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -156,7 +156,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   parquet_writer_options const& options,
-                  SingleWriteMode mode,
+                  single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
@@ -171,7 +171,7 @@ class writer {
    */
   explicit writer(std::vector<std::unique_ptr<data_sink>> sinks,
                   chunked_parquet_writer_options const& options,
-                  SingleWriteMode mode,
+                  single_write_mode mode,
                   rmm::cuda_stream_view stream);
 
   /**
diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp
index adb7078d96d..e84da92da6c 100644
--- a/cpp/include/cudf/io/detail/utils.hpp
+++ b/cpp/include/cudf/io/detail/utils.hpp
@@ -22,7 +22,7 @@ namespace detail {
 /**
  * @brief Whether writer writes in chunks or all at once
  */
-enum class SingleWriteMode : bool { YES, NO };
+enum class single_write_mode : bool { YES, NO };
 }  // namespace detail
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 76c50d548f9..ddbbce53bab 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -428,7 +428,7 @@ void write_orc(orc_writer_options const& options)
   CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing");
 
   auto writer = std::make_unique<detail_orc::writer>(
-    std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream());
+    std::move(sinks[0]), options, io_detail::single_write_mode::YES, cudf::get_default_stream());
 
   writer->write(options.get_table());
 }
@@ -444,7 +444,7 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options
   CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing");
 
   writer = std::make_unique<detail_orc::writer>(
-    std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, cudf::get_default_stream());
+    std::move(sinks[0]), options, io_detail::single_write_mode::NO, cudf::get_default_stream());
 }
 
 /**
@@ -519,7 +519,7 @@ std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const
 
   auto sinks  = make_datasinks(options.get_sink());
   auto writer = std::make_unique<detail_parquet::writer>(
-    std::move(sinks), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream());
+    std::move(sinks), options, io_detail::single_write_mode::YES, cudf::get_default_stream());
 
   writer->write(options.get_table(), options.get_partitions());
 
@@ -575,7 +575,7 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co
   auto sinks = make_datasinks(options.get_sink());
 
   writer = std::make_unique<detail_parquet::writer>(
-    std::move(sinks), options, io_detail::SingleWriteMode::NO, cudf::get_default_stream());
+    std::move(sinks), options, io_detail::single_write_mode::NO, cudf::get_default_stream());
 }
 
 /**
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 525bc5701ca..53aa8d2400f 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -375,11 +375,11 @@ __global__ void copy_string_data(char* string_pool,
 }  // namespace
 
 void persisted_statistics::persist(int num_table_rows,
-                                   SingleWriteMode write_mode,
+                                   single_write_mode write_mode,
                                    intermediate_statistics& intermediate_stats,
                                    rmm::cuda_stream_view stream)
 {
-  if (write_mode == SingleWriteMode::NO) {
+  if (write_mode == single_write_mode::NO) {
     // persist the strings in the chunks into a string pool and update pointers
     auto const num_chunks = static_cast<int>(intermediate_stats.stripe_stat_chunks.size());
     // min offset and max offset + 1 for total size
@@ -666,7 +666,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
                            std::map<uint32_t, size_t> const& decimal_column_sizes,
                            bool enable_dictionary,
                            CompressionKind compression_kind,
-                           SingleWriteMode write_mode)
+                           single_write_mode write_mode)
 {
   // 'column 0' row index stream
   std::vector<Stream> streams{{ROW_INDEX, 0}};  // TODO: Separate index and data streams?
@@ -681,7 +681,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
 
   for (auto& column : columns) {
     auto const is_nullable = [&]() -> bool {
-      if (write_mode == SingleWriteMode::YES) {
+      if (write_mode == single_write_mode::YES) {
         return column.nullable();
       } else {
         // For chunked write, when not provided nullability, we assume the worst case scenario
@@ -2200,7 +2200,7 @@ auto convert_table_to_orc_data(table_view const& input,
                                CompressionKind compression_kind,
                                size_t compression_blocksize,
                                statistics_freq stats_freq,
-                               SingleWriteMode write_mode,
+                               single_write_mode write_mode,
                                data_sink const& out_sink,
                                rmm::cuda_stream_view stream)
 {
@@ -2367,7 +2367,7 @@ auto convert_table_to_orc_data(table_view const& input,
 
 writer::impl::impl(std::unique_ptr<data_sink> sink,
                    orc_writer_options const& options,
-                   SingleWriteMode mode,
+                   single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
     _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
@@ -2387,7 +2387,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
 
 writer::impl::impl(std::unique_ptr<data_sink> sink,
                    chunked_orc_writer_options const& options,
-                   SingleWriteMode mode,
+                   single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
     _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
@@ -2688,7 +2688,7 @@ void writer::impl::close()
 // Forward to implementation
 writer::writer(std::unique_ptr<data_sink> sink,
                orc_writer_options const& options,
-               SingleWriteMode mode,
+               single_write_mode mode,
                rmm::cuda_stream_view stream)
   : _impl(std::make_unique<impl>(std::move(sink), options, mode, stream))
 {
@@ -2697,7 +2697,7 @@ writer::writer(std::unique_ptr<data_sink> sink,
 // Forward to implementation
 writer::writer(std::unique_ptr<data_sink> sink,
                chunked_orc_writer_options const& options,
-               SingleWriteMode mode,
+               single_write_mode mode,
                rmm::cuda_stream_view stream)
   : _impl(std::make_unique<impl>(std::move(sink), options, mode, stream))
 {
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index f4fb8207671..472b1294992 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -209,7 +209,7 @@ struct persisted_statistics {
   }
 
   void persist(int num_table_rows,
-               SingleWriteMode write_mode,
+               single_write_mode write_mode,
                intermediate_statistics& intermediate_stats,
                rmm::cuda_stream_view stream);
 
@@ -248,7 +248,7 @@ class writer::impl {
    */
   explicit impl(std::unique_ptr<data_sink> sink,
                 orc_writer_options const& options,
-                SingleWriteMode mode,
+                single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -261,7 +261,7 @@ class writer::impl {
    */
   explicit impl(std::unique_ptr<data_sink> sink,
                 chunked_orc_writer_options const& options,
-                SingleWriteMode mode,
+                single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -334,9 +334,9 @@ class writer::impl {
   CompressionKind const _compression_kind;
   size_t const _compression_blocksize;
   statistics_freq const _stats_freq;
-  SingleWriteMode const _single_write_mode;  // Special parameter only used by `write()` to indicate
-                                             // that we are guaranteeing a single table write. This
-                                             // enables some internal optimizations.
+  single_write_mode const _single_write_mode;  // Special parameter only used by `write()` to
+                                               // indicate that we are guaranteeing a single table
+                                               // write. This enables some internal optimizations.
   std::map<std::string, std::string> const _kv_meta;  // Optional user metadata.
   std::unique_ptr<data_sink> const _out_sink;
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 8b8c2be346f..48108ca2669 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -506,7 +506,7 @@ struct leaf_schema_fn {
 
 inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
                             column_in_metadata const& col_meta,
-                            SingleWriteMode write_mode)
+                            single_write_mode write_mode)
 {
   if (col_meta.is_nullability_defined()) {
     CUDF_EXPECTS(col_meta.nullable() || !col->nullable(),
@@ -516,7 +516,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
   }
   // For chunked write, when not provided nullability, we assume the worst case scenario
   // that all columns are nullable.
-  return write_mode == SingleWriteMode::NO or col->nullable();
+  return write_mode == single_write_mode::NO or col->nullable();
 }
 
 /**
@@ -528,7 +528,7 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
 std::vector<schema_tree_node> construct_schema_tree(
   cudf::detail::LinkedColVector const& linked_columns,
   table_input_metadata& metadata,
-  SingleWriteMode write_mode,
+  single_write_mode write_mode,
   bool int96_timestamps)
 {
   std::vector<schema_tree_node> schema;
@@ -1315,7 +1315,7 @@ size_t writer::impl::column_index_buffer_size(gpu::EncColumnChunk* ck) const
 
 writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
                    parquet_writer_options const& options,
-                   SingleWriteMode mode,
+                   single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
     _compression(to_parquet_compression(options.get_compression())),
@@ -1341,7 +1341,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
 
 writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
                    chunked_parquet_writer_options const& options,
-                   SingleWriteMode mode,
+                   single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
     _compression(to_parquet_compression(options.get_compression())),
@@ -2042,7 +2042,7 @@ std::unique_ptr<std::vector<uint8_t>> writer::impl::close(
 // Forward to implementation
 writer::writer(std::vector<std::unique_ptr<data_sink>> sinks,
                parquet_writer_options const& options,
-               SingleWriteMode mode,
+               single_write_mode mode,
                rmm::cuda_stream_view stream)
   : _impl(std::make_unique<impl>(std::move(sinks), options, mode, stream))
 {
@@ -2050,7 +2050,7 @@ writer::writer(std::vector<std::unique_ptr<data_sink>> sinks,
 
 writer::writer(std::vector<std::unique_ptr<data_sink>> sinks,
                chunked_parquet_writer_options const& options,
-               SingleWriteMode mode,
+               single_write_mode mode,
                rmm::cuda_stream_view stream)
   : _impl(std::make_unique<impl>(std::move(sinks), options, mode, stream))
 {
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index d4fdb9a8813..c88287994a1 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -68,7 +68,7 @@ class writer::impl {
    */
   explicit impl(std::vector<std::unique_ptr<data_sink>> sinks,
                 parquet_writer_options const& options,
-                SingleWriteMode mode,
+                single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -81,7 +81,7 @@ class writer::impl {
    */
   explicit impl(std::vector<std::unique_ptr<data_sink>> sinks,
                 chunked_parquet_writer_options const& options,
-                SingleWriteMode mode,
+                single_write_mode mode,
                 rmm::cuda_stream_view stream);
 
   /**
@@ -229,9 +229,9 @@ class writer::impl {
   bool const _int96_timestamps;
   int32_t const _column_index_truncate_length;
   std::vector<std::map<std::string, std::string>> const _kv_meta;  // Optional user metadata.
-  SingleWriteMode const _single_write_mode;  // Special parameter only used by `write()` to indicate
-                                             // that we are guaranteeing a single table write.
-                                             // This enables some internal optimizations.
+  single_write_mode const _single_write_mode;  // Special parameter only used by `write()` to
+                                               // indicate that we are guaranteeing a single table
+                                               // write. This enables some internal optimizations.
   std::vector<std::unique_ptr<data_sink>> const _out_sink;
 
   // Internal states, filled during `write()` and written to sink during `write` and `close()`.

From 21c3139d1b8332e0f4fa709895cfa4738700d2b8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 11:25:01 -0700
Subject: [PATCH 12/18] Fix copyright year

---
 cpp/include/cudf/io/detail/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp
index e84da92da6c..7bbda21858d 100644
--- a/cpp/include/cudf/io/detail/utils.hpp
+++ b/cpp/include/cudf/io/detail/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 2b01e442b839d09b06bf8bf508257cc5dbc3b321 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 11:46:52 -0700
Subject: [PATCH 13/18] Switch to use `host_span`

---
 cpp/src/io/orc/writer_impl.cu  | 2 +-
 cpp/src/io/orc/writer_impl.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 53aa8d2400f..8f7a28b2d8c 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2476,7 +2476,7 @@ void writer::impl::write_orc_data_to_sink(orc_streams& streams,
                                           hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
                                           encoded_data const& enc_data,
                                           file_segmentation const& segmentation,
-                                          std::vector<StripeInformation>& stripes,
+                                          host_span<StripeInformation> stripes,
                                           orc_table_view const& orc_table,
                                           rmm::device_buffer const& compressed_data,
                                           intermediate_statistics& intermediate_stats,
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 472b1294992..a063506f55a 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -309,7 +309,7 @@ class writer::impl {
                               hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
                               encoded_data const& enc_data,
                               file_segmentation const& segmentation,
-                              std::vector<StripeInformation>& stripes,
+                              host_span<StripeInformation> stripes,
                               orc_table_view const& orc_table,
                               rmm::device_buffer const& compressed_data,
                               intermediate_statistics& intermediate_stats,

From 2230619075158a234e130ab57fb219ee6fe06b0f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 11:50:57 -0700
Subject: [PATCH 14/18] Reorganize parameters

---
 cpp/src/io/orc/writer_impl.cu  | 12 ++++++------
 cpp/src/io/orc/writer_impl.hpp | 26 +++++++++++++-------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 8f7a28b2d8c..41588ec9d88 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2456,30 +2456,30 @@ void writer::impl::write(table_view const& input)
   }();
 
   // Compression/encoding were all successful. Now write the intermediate results.
-  write_orc_data_to_sink(streams,
-                         comp_results,
+  write_orc_data_to_sink(comp_results,
                          strm_descs,
                          enc_data,
                          segmentation,
-                         stripes,
                          orc_table,
                          compressed_data,
                          intermediate_stats,
+                         streams,
+                         stripes,
                          bounce_buffer);
 
   // Update data into the footer. This needs to be called even when num_rows==0.
   add_table_to_footer_data(orc_table, stripes);
 }
 
-void writer::impl::write_orc_data_to_sink(orc_streams& streams,
-                                          hostdevice_vector<compression_result> const& comp_results,
+void writer::impl::write_orc_data_to_sink(hostdevice_vector<compression_result> const& comp_results,
                                           hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
                                           encoded_data const& enc_data,
                                           file_segmentation const& segmentation,
-                                          host_span<StripeInformation> stripes,
                                           orc_table_view const& orc_table,
                                           rmm::device_buffer const& compressed_data,
                                           intermediate_statistics& intermediate_stats,
+                                          orc_streams& streams,
+                                          host_span<StripeInformation> stripes,
                                           host_span<uint8_t> bounce_buffer)
 {
   if (orc_table.num_rows() == 0) { return; }
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index a063506f55a..0e8c70c9351 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -293,26 +293,26 @@ class writer::impl {
    * The intermediate data is generated from processing (compressing/encoding) an cuDF input table
    * by `process_for_write` called in the `write()` function.
    *
-   * @param streams List of stream descriptors
-   * @param comp_results Status of data compression
-   * @param strm_descs List of stream descriptors
-   * @param enc_data ORC per-chunk streams of encoded data
-   * @param segmentation Description of how the ORC file is segmented into stripes and rowgroups
-   * @param stripes List of stripe description
-   * @param orc_table Non-owning view of a cuDF table that includes ORC-related information
-   * @param compressed_data Compressed stream data
-   * @param intermediate_stats Statistics data stored between calls to write
-   * @param bounce_buffer Temporary host output buffer
+   * @param[in] comp_results Status of data compression
+   * @param[in] strm_descs List of stream descriptors
+   * @param[in] enc_data ORC per-chunk streams of encoded data
+   * @param[in] segmentation Description of how the ORC file is segmented into stripes and rowgroups
+   * @param[in] orc_table Non-owning view of a cuDF table that includes ORC-related information
+   * @param[in] compressed_data Compressed stream data
+   * @param[in,out] intermediate_stats Statistics data stored between calls to write
+   * @param[in,out] streams List of stream descriptors
+   * @param[in,out] stripes List of stripe description
+   * @param[in,out] bounce_buffer Temporary host output buffer
    */
-  void write_orc_data_to_sink(orc_streams& streams,
-                              hostdevice_vector<compression_result> const& comp_results,
+  void write_orc_data_to_sink(hostdevice_vector<compression_result> const& comp_results,
                               hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
                               encoded_data const& enc_data,
                               file_segmentation const& segmentation,
-                              host_span<StripeInformation> stripes,
                               orc_table_view const& orc_table,
                               rmm::device_buffer const& compressed_data,
                               intermediate_statistics& intermediate_stats,
+                              orc_streams& streams,
+                              host_span<StripeInformation> stripes,
                               host_span<uint8_t> bounce_buffer);
 
   /**

From 9170fef683ff981c3f09fdf686a8ae26db57b2d0 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 12:05:22 -0700
Subject: [PATCH 15/18] More span adoption

---
 cpp/src/io/orc/writer_impl.cu  | 32 +++++++++++++++++---------------
 cpp/src/io/orc/writer_impl.hpp | 12 ++++++------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 41588ec9d88..70e15c96bb8 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2456,27 +2456,29 @@ void writer::impl::write(table_view const& input)
   }();
 
   // Compression/encoding were all successful. Now write the intermediate results.
-  write_orc_data_to_sink(comp_results,
-                         strm_descs,
-                         enc_data,
-                         segmentation,
-                         orc_table,
-                         compressed_data,
-                         intermediate_stats,
-                         streams,
-                         stripes,
-                         bounce_buffer);
+  write_orc_data_to_sink(
+    enc_data,
+    segmentation,
+    orc_table,
+    device_span<uint8_t const>(reinterpret_cast<uint8_t const*>(compressed_data.data()),
+                               compressed_data.size()),
+    comp_results,
+    strm_descs,
+    intermediate_stats,
+    streams,
+    stripes,
+    bounce_buffer);
 
   // Update data into the footer. This needs to be called even when num_rows==0.
   add_table_to_footer_data(orc_table, stripes);
 }
 
-void writer::impl::write_orc_data_to_sink(hostdevice_vector<compression_result> const& comp_results,
-                                          hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
-                                          encoded_data const& enc_data,
+void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
                                           file_segmentation const& segmentation,
                                           orc_table_view const& orc_table,
-                                          rmm::device_buffer const& compressed_data,
+                                          device_span<uint8_t const> compressed_data,
+                                          host_span<compression_result const> comp_results,
+                                          host_2dspan<gpu::StripeStream const> strm_descs,
                                           intermediate_statistics& intermediate_stats,
                                           orc_streams& streams,
                                           host_span<StripeInformation> stripes,
@@ -2519,7 +2521,7 @@ void writer::impl::write_orc_data_to_sink(hostdevice_vector<compression_result>
       write_tasks.push_back(write_data_stream(
         strm_desc,
         enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first],
-        static_cast<uint8_t const*>(compressed_data.data()),
+        compressed_data.data(),
         bounce_buffer.data(),
         &stripe,
         &streams,
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 0e8c70c9351..5cabfd66d95 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -293,23 +293,23 @@ class writer::impl {
    * The intermediate data is generated from processing (compressing/encoding) an cuDF input table
    * by `process_for_write` called in the `write()` function.
    *
-   * @param[in] comp_results Status of data compression
-   * @param[in] strm_descs List of stream descriptors
    * @param[in] enc_data ORC per-chunk streams of encoded data
    * @param[in] segmentation Description of how the ORC file is segmented into stripes and rowgroups
    * @param[in] orc_table Non-owning view of a cuDF table that includes ORC-related information
    * @param[in] compressed_data Compressed stream data
+   * @param[in] comp_results Status of data compression
+   * @param[in] strm_descs List of stream descriptors
    * @param[in,out] intermediate_stats Statistics data stored between calls to write
    * @param[in,out] streams List of stream descriptors
    * @param[in,out] stripes List of stripe description
    * @param[in,out] bounce_buffer Temporary host output buffer
    */
-  void write_orc_data_to_sink(hostdevice_vector<compression_result> const& comp_results,
-                              hostdevice_2dvector<gpu::StripeStream> const& strm_descs,
-                              encoded_data const& enc_data,
+  void write_orc_data_to_sink(encoded_data const& enc_data,
                               file_segmentation const& segmentation,
                               orc_table_view const& orc_table,
-                              rmm::device_buffer const& compressed_data,
+                              device_span<uint8_t const> compressed_data,
+                              host_span<compression_result const> comp_results,
+                              host_2dspan<gpu::StripeStream const> strm_descs,
                               intermediate_statistics& intermediate_stats,
                               orc_streams& streams,
                               host_span<StripeInformation> stripes,

From 9cc92ddef815dd68f3b1966c8e57c3469a5f359d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 13:36:03 -0700
Subject: [PATCH 16/18] Fix docs

---
 cpp/src/io/orc/writer_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 5cabfd66d95..2b04b418b41 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -302,7 +302,7 @@ class writer::impl {
    * @param[in,out] intermediate_stats Statistics data stored between calls to write
    * @param[in,out] streams List of stream descriptors
    * @param[in,out] stripes List of stripe description
-   * @param[in,out] bounce_buffer Temporary host output buffer
+   * @param[out] bounce_buffer Temporary host output buffer
    */
   void write_orc_data_to_sink(encoded_data const& enc_data,
                               file_segmentation const& segmentation,

From 4282b9432f3b2fcf56bf8196f7593606ed04856c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 13:44:37 -0700
Subject: [PATCH 17/18] Reorder parameters again

---
 cpp/src/io/orc/writer_impl.cu | 38 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 70e15c96bb8..c20a5ec96df 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2273,16 +2273,16 @@ auto convert_table_to_orc_data(table_view const& input,
   auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data, &strm_descs, stream);
 
   if (num_rows == 0) {
-    return std::tuple{std::move(streams),
-                      hostdevice_vector<compression_result>{},  // comp_results
-                      std::move(strm_descs),
-                      std::move(enc_data),
+    return std::tuple{std::move(enc_data),
                       std::move(segmentation),
-                      std::move(stripe_dict),
-                      std::move(stripes),
                       std::move(orc_table),
-                      rmm::device_buffer{},  // compressed_data
+                      rmm::device_buffer{},                     // compressed_data
+                      hostdevice_vector<compression_result>{},  // comp_results
+                      std::move(strm_descs),
                       intermediate_statistics{stream},
+                      std::move(streams),
+                      std::move(stripes),
+                      std::move(stripe_dict),
                       cudf::detail::pinned_host_vector<uint8_t>()};
   }
 
@@ -2350,16 +2350,16 @@ auto convert_table_to_orc_data(table_view const& input,
 
   auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream);
 
-  return std::tuple{std::move(streams),
-                    std::move(comp_results),
-                    std::move(strm_descs),
-                    std::move(enc_data),
+  return std::tuple{std::move(enc_data),
                     std::move(segmentation),
-                    std::move(stripe_dict),
-                    std::move(stripes),
                     std::move(orc_table),
                     std::move(compressed_data),
+                    std::move(comp_results),
+                    std::move(strm_descs),
                     std::move(intermediate_stats),
+                    std::move(streams),
+                    std::move(stripes),
+                    std::move(stripe_dict),
                     std::move(bounce_buffer)};
 }
 
@@ -2424,16 +2424,16 @@ void writer::impl::write(table_view const& input)
   // is still intact.
   // Note that `out_sink_` is intentionally passed by const reference to prevent accidentally
   // writing anything to it.
-  [[maybe_unused]] auto [streams,
-                         comp_results,
-                         strm_descs,
-                         enc_data,
+  [[maybe_unused]] auto [enc_data,
                          segmentation,
-                         stripe_dict, /* unused, but its data will be accessed via pointer later */
-                         stripes,
                          orc_table,
                          compressed_data,
+                         comp_results,
+                         strm_descs,
                          intermediate_stats,
+                         streams,
+                         stripes,
+                         stripe_dict, /* unused, but its data will be accessed via pointer later */
                          bounce_buffer] = [&] {
     try {
       return convert_table_to_orc_data(input,

From 156463c7d6a2286f2452dc84450efa316b711a46 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 24 Apr 2023 13:55:29 -0700
Subject: [PATCH 18/18] Simplify code by changing `device_buffer` into
 `device_uvector`

---
 cpp/src/io/orc/writer_impl.cu | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index c20a5ec96df..fef1bb23733 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2276,7 +2276,7 @@ auto convert_table_to_orc_data(table_view const& input,
     return std::tuple{std::move(enc_data),
                       std::move(segmentation),
                       std::move(orc_table),
-                      rmm::device_buffer{},                     // compressed_data
+                      rmm::device_uvector<uint8_t>{0, stream},  // compressed_data
                       hostdevice_vector<compression_result>{},  // comp_results
                       std::move(strm_descs),
                       intermediate_statistics{stream},
@@ -2322,7 +2322,7 @@ auto convert_table_to_orc_data(table_view const& input,
   }();
 
   // Compress the data streams
-  rmm::device_buffer compressed_data(compressed_bfr_size, stream);
+  rmm::device_uvector<uint8_t> compressed_data(compressed_bfr_size, stream);
   hostdevice_vector<compression_result> comp_results(num_compressed_blocks, stream);
   thrust::fill(rmm::exec_policy(stream),
                comp_results.d_begin(),
@@ -2330,7 +2330,7 @@ auto convert_table_to_orc_data(table_view const& input,
                compression_result{0, compression_status::FAILURE});
   if (compression_kind != NONE) {
     strm_descs.host_to_device(stream);
-    gpu::CompressOrcDataStreams(static_cast<uint8_t*>(compressed_data.data()),
+    gpu::CompressOrcDataStreams(compressed_data.data(),
                                 num_compressed_blocks,
                                 compression_kind,
                                 compression_blocksize,
@@ -2456,18 +2456,16 @@ void writer::impl::write(table_view const& input)
   }();
 
   // Compression/encoding were all successful. Now write the intermediate results.
-  write_orc_data_to_sink(
-    enc_data,
-    segmentation,
-    orc_table,
-    device_span<uint8_t const>(reinterpret_cast<uint8_t const*>(compressed_data.data()),
-                               compressed_data.size()),
-    comp_results,
-    strm_descs,
-    intermediate_stats,
-    streams,
-    stripes,
-    bounce_buffer);
+  write_orc_data_to_sink(enc_data,
+                         segmentation,
+                         orc_table,
+                         compressed_data,
+                         comp_results,
+                         strm_descs,
+                         intermediate_stats,
+                         streams,
+                         stripes,
+                         bounce_buffer);
 
   // Update data into the footer. This needs to be called even when num_rows==0.
   add_table_to_footer_data(orc_table, stripes);