From 8d49db5bbd959d8b1ea28cbebf896e6e911716f5 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 27 Oct 2022 13:21:38 -0500
Subject: [PATCH 1/4] Ignore python docs build artifacts (#12000)

This PR gitignores some of the python docs build artifcats that keep showing up in `git status`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/12000
---
 .gitignore                              | 2 ++
 docs/cudf/source/user_guide/10min.ipynb | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index aaac92ff643..91a7ecc49f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,3 +165,5 @@ dask-worker-space/
 # Sphinx docs & build artifacts
 docs/cudf/source/api_docs/generated/*
 docs/cudf/source/api_docs/api/*
+docs/cudf/source/user_guide/example_output/*
+docs/cudf/source/user_guide/cudf.*Dtype.*.rst
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index b9278151e64..ce6c55fe134 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -5474,7 +5474,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Writing to parquet files, using the CPU via PyArrow."
+    "Writing to parquet files with GPU-accelerated parquet writer"
    ]
   },
   {
@@ -5749,7 +5749,7 @@
     }
    ],
    "source": [
-    "ddf.to_parquet('example_files')  "
+    "ddf.to_parquet('example_output/ddf_parquet_files')"
    ]
   },
   {

From b4ca89492b5e4bf8d54102a4082c2d362350e783 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 27 Oct 2022 20:26:22 +0200
Subject: [PATCH 2/4] Add `strip_delimiters` option to `read_text` (#11946)

This adds a `strip_delimiters` post-processing option to `read_text`. I needed to implement some lightweight striping because a thread-per-row parallelization of the string gather gave pretty bad performance.

For consistency, I also removed the special-case handling of delimiters at the end (previously adding an empty row), to match the read_csv behavior.

Benchmark results:

```
benchmarks/MULTIBYTE_SPLIT_NVBENCH --axis size_approx[pow2]=30 --axis byte_range_percent=100 --axis T=device --axis delim_size=4
```

### [0] Tesla T4

|   T    | strip_delimiters | delim_percent |    size_approx    |   CPU Time  | Noise | Peak Memory Usage | Encoded file size |
|--------|------------------|---------------|-------------------|------------|-------|-------------------|-------------------|
| device |                0 |             1 | 2^30 = 1073741824 | 178.133 ms | 0.36% |         3.709 GiB |      1014.442 MiB |
| device |                1 |             1 | 2^30 = 1073741824 | 188.328 ms | 0.31% |         4.690 GiB |      1014.442 MiB |
| device |                0 |            25 | 2^30 = 1073741824 | 206.188 ms | 0.03% |         5.292 GiB |       953.075 MiB |
| device |                1 |            25 | 2^30 = 1073741824 | 242.534 ms | 0.50% |         5.975 GiB |       953.075 MiB |

Closes #11625

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/11946
---
 cpp/benchmarks/io/text/multibyte_split.cpp   |  5 +-
 cpp/include/cudf/io/text/multibyte_split.hpp | 30 +++++-
 cpp/src/io/text/multibyte_split.cu           | 48 ++++++++--
 cpp/tests/io/text/multibyte_split_test.cpp   | 98 +++++++++++++++++++-
 python/cudf/cudf/_lib/cpp/io/text.pxd        | 10 +-
 python/cudf/cudf/_lib/text.pyx               | 22 +++--
 python/cudf/cudf/io/text.py                  |  2 +
 python/cudf/cudf/tests/test_text.py          | 24 ++++-
 python/cudf/cudf/utils/ioutils.py            |  7 ++
 9 files changed, 212 insertions(+), 34 deletions(-)

diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 380766fee46..c0e82b34623 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const delim_percent      = state.get_int64("delim_percent");
   auto const file_size_approx   = state.get_int64("size_approx");
   auto const byte_range_percent = state.get_int64("byte_range_percent");
+  auto const strip_delimiters   = bool(state.get_int64("strip_delimiters"));
 
   auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
   CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
@@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const range_size   = static_cast<int64_t>(device_input.size() * byte_range_factor);
   auto const range_offset = (device_input.size() - range_size) / 2;
   cudf::io::text::byte_range_info range{range_offset, range_size};
+  cudf::io::text::parse_options options{range, strip_delimiters};
   std::unique_ptr<cudf::column> output;
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     try_drop_l3_cache();
-    output = cudf::io::text::multibyte_split(*source, delim, range);
+    output = cudf::io::text::multibyte_split(*source, delim, options);
   });
 
   state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
@@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
 
 NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
   .set_name("multibyte_split")
+  .add_int64_axis("strip_delimiters", {0, 1})
   .add_int64_axis("delim_size", {1, 4, 7})
   .add_int64_axis("delim_percent", {1, 25})
   .add_int64_power_of_two_axis("size_approx", {15, 30})
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index abb966a55bf..a7edc9be0e4 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,11 +30,25 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Parsing options for multibyte_split.
+ */
+struct parse_options {
+  /**
+   * @brief Only rows starting inside this byte range will be part of the output column.
+   */
+  byte_range_info byte_range = create_byte_range_info_max();
+  /**
+   * @brief Whether delimiters at the end of rows should be stripped from the output column
+   */
+  bool strip_delimiters = false;
+};
+
 /**
  * @brief Splits the source text into a strings column using a multiple byte delimiter.
  *
- * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
- * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * Providing a byte range allows multibyte_split to read a file partially, only returning the
+ * offsets of delimiters which begin within the range. If thinking in terms of "records", where each
  * delimiter dictates the end of a record, all records which begin within the byte range provided
  * will be returned, including any record which may begin in the range but end outside of the
  * range. Records which begin outside of the range will ignored, even if those records end inside
@@ -63,7 +77,7 @@ namespace text {
  *
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range range in which to consider offsets relevant
+ * @param options the parsing options to use (including byte range)
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
@@ -71,8 +85,14 @@ namespace text {
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  std::optional<byte_range_info> byte_range = std::nullopt,
-  rmm::mr::device_memory_resource* mr       = rmm::mr::get_current_device_resource());
+  parse_options options               = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+std::unique_ptr<cudf::column> multibyte_split(
+  data_chunk_source const& source,
+  std::string const& delimiter,
+  std::optional<byte_range_info> byte_range,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
                                               std::string const& delimiter,
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 29cec0e8c3f..0d699fc72fd 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -21,13 +21,16 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
 #include <cudf/io/text/detail/multistate.hpp>
 #include <cudf/io/text/detail/tile_state.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -551,6 +554,7 @@ class output_builder {
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               byte_range_info byte_range,
+                                              bool strip_delimiters,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr,
                                               rmm::cuda_stream_pool& stream_pool)
@@ -756,8 +760,12 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   auto chars          = char_storage.gather(stream, mr);
   auto global_offsets = row_offset_storage.gather(stream, mr);
 
-  bool const insert_begin = *first_row_offset == 0;
-  bool const insert_end   = not last_row_offset.has_value() or last_row_offset == chunk_offset;
+  // insert an offset at the beginning if we started at the beginning of the input
+  bool const insert_begin = first_row_offset.value_or(0) == 0;
+  // insert an offset at the end if we have not terminated the last row
+  bool const insert_end =
+    not(last_row_offset.has_value() or
+        (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset));
   rmm::device_uvector<int32_t> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
@@ -771,10 +779,27 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                     [baseline = *first_row_offset] __device__(byte_offset global_offset) {
                       return static_cast<int32_t>(global_offset - baseline);
                     });
-
   auto string_count = offsets.size() - 1;
-
-  return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  if (strip_delimiters) {
+    auto it = cudf::detail::make_counting_transform_iterator(
+      0,
+      [ofs        = offsets.data(),
+       chars      = chars.data(),
+       delim_size = static_cast<size_type>(delimiter.size()),
+       last_row   = static_cast<size_type>(string_count) - 1,
+       insert_end] __device__(size_type row) {
+        auto const begin = ofs[row];
+        auto const len   = ofs[row + 1] - begin;
+        if (row == last_row && insert_end) {
+          return thrust::make_pair(chars + begin, len);
+        } else {
+          return thrust::make_pair(chars + begin, std::max<size_type>(0, len - delim_size));
+        };
+      });
+    return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
+  } else {
+    return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  }
 }
 
 }  // namespace detail
@@ -783,12 +808,21 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
                                               rmm::mr::device_memory_resource* mr)
+{
+  return multibyte_split(
+    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+}
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              parse_options options,
+                                              rmm::mr::device_memory_resource* mr)
 {
   auto stream      = cudf::get_default_stream();
   auto stream_pool = rmm::cuda_stream_pool(2);
 
   auto result = detail::multibyte_split(
-    source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool);
+    source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool);
 
   return result;
 }
@@ -797,7 +831,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return multibyte_split(source, delimiter, std::nullopt, mr);
+  return multibyte_split(source, delimiter, parse_options{}, mr);
 }
 
 }  // namespace text
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 43debf3d5b3..2da7073b334 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -62,12 +62,25 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, NoDelimiter)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg");
+
+  auto expected = strings_column_wrapper{"abcdefg"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, DelimiterAtEnd)
 {
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiter);
@@ -80,7 +93,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(
@@ -91,6 +104,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg:");
+
+  auto expected = strings_column_wrapper{"abcdefg:"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(
+    *source,
+    delimiter,
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, LargeInputSparse)
 {
   auto host_input    = std::string(1024 * 1024 * 32, '.');
@@ -120,8 +149,6 @@ TEST_F(MultibyteSplitTest, LargeInput)
     host_expected.emplace_back(std::string("...:|"));
   }
 
-  host_expected.emplace_back(std::string(""));
-
   auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()};
 
   auto delimiter = std::string("...:|");
@@ -146,6 +173,52 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure)
   // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported.
 }
 
+TEST_F(MultibyteSplitTest, DelimiterErasure)
+{
+  auto delimiter = "\r\n";
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterErasureByteRange)
+{
+  auto delimiter = "\r\n";
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  options.byte_range       = cudf::io::text::byte_range_info(0, host_input.size() - 1);
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterErasureOverlap)
+{
+  auto delimiter = "::";
+
+  auto host_input = std::string("::a:::b::c::::d");
+  auto expected   = strings_column_wrapper{"", "a", "", "b", "c", "", "", "d"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, HandpickedInput)
 {
   auto delimiters = "::|";
@@ -184,7 +257,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput)
     "ggg::|",         "hhh::|",      "___::|",       "here::|", "is::|",     "another::|",
     "simple::|",      "text::|",     "seperated::|", "by::|",   "emojis::|", "which::|",
     "are::|",         "multiple::|", "bytes::|",     "and::|",  "used::|",   "as::|",
-    "delimiters.::|", "::|",         "::|",          "::|",     ""};
+    "delimiters.::|", "::|",         "::|",          "::|"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiters);
@@ -359,6 +432,21 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
   }
 }
 
+TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
+{
+  // we want a delimiter at the end of the file to not create a new empty row even if it is the only
+  // character in the byte range
+  using namespace cudf::io::text;
+  auto host_input = std::string("ab:cd:");
+  auto delimiter  = std::string(":");
+  auto source     = make_source(host_input);
+  auto expected   = strings_column_wrapper{};
+
+  auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS);
+}
+
 TEST_F(MultibyteSplitTest, EmptyInput)
 {
   using namespace cudf::io::text;
diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd
index 7bbe870dad3..368b014ea4b 100644
--- a/python/cudf/cudf/_lib/cpp/io/text.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/text.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint64_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -37,9 +38,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
 cdef extern from "cudf/io/text/multibyte_split.hpp" \
         namespace "cudf::io::text" nogil:
 
-    unique_ptr[column] multibyte_split(data_chunk_source source,
-                                       string delimiter) except +
+    cdef cppclass parse_options:
+        byte_range_info byte_range
+        bool strip_delimiters
+
+        parse_options() except +
 
     unique_ptr[column] multibyte_split(data_chunk_source source,
                                        string delimiter,
-                                       byte_range_info byte_range) except +
+                                       parse_options options) except +
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index 31a5617af58..be11132497e 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -19,12 +19,14 @@ from cudf._lib.cpp.io.text cimport (
     make_source_from_bgzip_file,
     make_source_from_file,
     multibyte_split,
+    parse_options,
 )
 
 
 def read_text(object filepaths_or_buffers,
               object delimiter=None,
               object byte_range=None,
+              object strip_delimiters=False,
               object compression=None,
               object compression_offsets=None):
     """
@@ -44,6 +46,7 @@ def read_text(object filepaths_or_buffers,
     cdef byte_range_info c_byte_range
     cdef uint64_t c_compression_begin_offset
     cdef uint64_t c_compression_end_offset
+    cdef parse_options c_options
 
     if compression is None:
         if isinstance(filepaths_or_buffers, TextIOBase):
@@ -71,19 +74,18 @@ def read_text(object filepaths_or_buffers,
     else:
         raise ValueError("Only bgzip compression is supported at the moment")
 
-    if (byte_range is None):
-        with nogil:
-            c_col = move(multibyte_split(dereference(datasource), delim))
-    else:
+    c_options = parse_options()
+    if byte_range is not None:
         c_byte_range_offset = byte_range[0]
         c_byte_range_size = byte_range[1]
-        c_byte_range = byte_range_info(
+        c_options.byte_range = byte_range_info(
             c_byte_range_offset,
             c_byte_range_size)
-        with nogil:
-            c_col = move(multibyte_split(
-                dereference(datasource),
-                delim,
-                c_byte_range))
+    c_options.strip_delimiters = strip_delimiters
+    with nogil:
+        c_col = move(multibyte_split(
+            dereference(datasource),
+            delim,
+            c_options))
 
     return {None: Column.from_unique_ptr(move(c_col))}
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 23983f01966..f341edbf6c1 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -14,6 +14,7 @@ def read_text(
     filepath_or_buffer,
     delimiter=None,
     byte_range=None,
+    strip_delimiters=False,
     compression=None,
     compression_offsets=None,
     **kwargs,
@@ -35,6 +36,7 @@ def read_text(
             filepath_or_buffer,
             delimiter=delimiter,
             byte_range=byte_range,
+            strip_delimiters=strip_delimiters,
             compression=compression,
             compression_offsets=compression_offsets,
         )
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 7f41d606473..627bf0a68bb 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -827,14 +827,20 @@ def test_read_text_byte_range(datadir):
 
 
 def test_read_text_byte_range_large(tmpdir):
-    content = str([["\n" if x % 5 == 0 else "x"] for x in range(0, 3000)])
-    delimiter = "1."
+    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
+    delimiter = "\n"
     temp_file = str(tmpdir) + "/temp.txt"
 
     with open(temp_file, "w") as f:
         f.write(content)
 
-    cudf.read_text(temp_file, delimiter=delimiter)
+    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
+
+    actual = cudf.read_text(
+        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
+    )
+
+    assert_eq(expected, actual)
 
 
 def test_read_text_in_memory(datadir):
@@ -847,6 +853,18 @@ def test_read_text_in_memory(datadir):
     assert_eq(expected, actual)
 
 
+def test_read_text_in_memory_strip_delimiter(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x", "y", "z"])
+
+    actual = cudf.read_text(
+        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
+    )
+
+    assert_eq(expected, actual)
+
+
 def test_read_text_bgzip(datadir):
     chess_file_compressed = str(datadir) + "/chess.pgn.gz"
     chess_file = str(datadir) + "/chess.pgn"
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 0a0647f1297..5298e470a91 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1215,6 +1215,13 @@
 delimiter : string, default None
     The delimiter that should be used for splitting text chunks into
     separate cudf column rows. The delimiter may be one or more characters.
+strip_delimiters : boolean, default False
+    Unlike the `str.split()` function, `read_text` preserves the delimiter
+    at the end of a field in output by default, meaning `a;b;c` will turn into
+    `['a;','b;','c']` when using `;` as a delimiter.
+    Setting this option to `True` will strip these trailing delimiters,
+    leaving only the contents between delimiters in the resulting column:
+    `['a','b','c']`
 byte_range : list or tuple, default None
     Byte range within the input file to be read. The first number is the
     offset in bytes, the second number is the range size in bytes.

From 43eb7a07c8ed1afdf5b80d2912d6e8993e5262d9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 27 Oct 2022 21:26:55 +0200
Subject: [PATCH 3/4] Refactor multibyte_split `output_builder` (#11945)

This PR moves the `output_builder` and `split_device_span` classes out of `multibyte_split` and adds an iterator for the `split_device_span`, enabling it to be used directly in Thrust algorithms.

I also included a fix from #11875 to make the integration easier once that is merged.

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/11945
---
 cpp/src/io/text/multibyte_split.cu         | 217 +------------
 cpp/src/io/utilities/output_builder.cuh    | 357 +++++++++++++++++++++
 cpp/tests/io/text/multibyte_split_test.cpp |  64 ++++
 3 files changed, 423 insertions(+), 215 deletions(-)
 create mode 100644 cpp/src/io/utilities/output_builder.cuh

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 0d699fc72fd..1177be6b63f 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -14,10 +14,7 @@
  * limitations under the License.
  */
 
-// Can be removed once we use Thrust 1.16+
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wsizeof-array-div"
+#include <io/utilities/output_builder.cuh>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
@@ -48,54 +45,12 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 
-#pragma GCC diagnostic pop
-
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <numeric>
 #include <optional>
 
-namespace cudf {
-
-/**
- * @brief A device span consisting of two separate device_spans acting as if they were part of a
- * single span. The first head.size() entries are served from the first span, the remaining
- * tail.size() entries are served from the second span.
- *
- * @tparam T The type of elements in the span.
- */
-template <typename T>
-class split_device_span {
- public:
-  explicit constexpr split_device_span(device_span<T> head, device_span<T> tail = {})
-    : _head{head}, _tail{tail}
-  {
-  }
-
-  [[nodiscard]] constexpr T& operator[](size_type i)
-  {
-    return i < _head.size() ? _head[i] : _tail[i - _head.size()];
-  }
-
-  [[nodiscard]] constexpr const T& operator[](size_type i) const
-  {
-    return i < _head.size() ? _head[i] : _tail[i - _head.size()];
-  }
-
-  [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); }
-
-  [[nodiscard]] constexpr device_span<T> head() const { return _head; }
-
-  [[nodiscard]] constexpr device_span<T> tail() const { return _tail; }
-
- private:
-  device_span<T> _head;
-  device_span<T> _tail;
-};
-
-}  // namespace cudf
-
 namespace {
 
 using cudf::io::text::detail::multistate;
@@ -385,172 +340,6 @@ std::vector<rmm::cuda_stream_view> get_streams(int32_t count, rmm::cuda_stream_p
   return streams;
 }
 
-/**
- * @brief A chunked storage class that provides preallocated memory for algorithms with known
- * worst-case output size. It provides functionality to retrieve the next chunk to write to, for
- * reporting how much memory was actually written and for gathering all previously written outputs
- * into a single contiguous vector.
- *
- * @tparam T The output element type.
- */
-template <typename T>
-class output_builder {
- public:
-  using size_type = typename rmm::device_uvector<T>::size_type;
-
-  /**
-   * @brief Initializes an output builder with given worst-case output size and stream.
-   *
-   * @param max_write_size the maximum number of elements that will be written into a
-   *                       split_device_span returned from `next_output`.
-   * @param stream the stream used to allocate the first chunk of memory.
-   * @param mr optional, the memory resource to use for allocation.
-   */
-  output_builder(size_type max_write_size,
-                 size_type max_growth,
-                 rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-    : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth}
-  {
-    CUDF_EXPECTS(max_write_size > 0, "Internal error");
-    _chunks.emplace_back(0, stream, mr);
-    _chunks.back().reserve(max_write_size * 2, stream);
-  }
-
-  output_builder(output_builder&&)      = delete;
-  output_builder(const output_builder&) = delete;
-  output_builder& operator=(output_builder&&) = delete;
-  output_builder& operator=(const output_builder&) = delete;
-
-  /**
-   * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage.
-   * Must be followed by a call to `advance_output` after the memory has been written to.
-   *
-   * @param stream The stream to allocate a new chunk of memory with, if necessary.
-   *               This should be the stream that will write to the `split_device_span`.
-   * @return A `split_device_span` starting directly after the last output and providing at least
-   *         `max_write_size` entries of storage.
-   */
-  [[nodiscard]] split_device_span<T> next_output(rmm::cuda_stream_view stream)
-  {
-    auto head_it   = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1);
-    auto head_span = get_free_span(*head_it);
-    if (head_span.size() >= _max_write_size) { return split_device_span<T>{head_span}; }
-    if (head_it == _chunks.end() - 1) {
-      // insert a new vector of double size
-      auto const next_chunk_size =
-        std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity());
-      _chunks.emplace_back(0, stream, _chunks.back().memory_resource());
-      _chunks.back().reserve(next_chunk_size, stream);
-    }
-    auto tail_span = get_free_span(_chunks.back());
-    CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error");
-    return split_device_span<T>{head_span, tail_span};
-  }
-
-  /**
-   * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was
-   *        written to.
-   *
-   * @param actual_size The number of elements that were written to the result of the previous
-   *                    `next_output` call.
-   */
-  void advance_output(size_type actual_size, rmm::cuda_stream_view stream)
-  {
-    CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error");
-    if (_chunks.size() < 2) {
-      auto const new_size = _chunks.back().size() + actual_size;
-      inplace_resize(_chunks.back(), new_size, stream);
-    } else {
-      auto& tail              = _chunks.back();
-      auto& prev              = _chunks.rbegin()[1];
-      auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size());
-      auto const tail_advance = actual_size - prev_advance;
-      inplace_resize(prev, prev.size() + prev_advance, stream);
-      inplace_resize(tail, tail.size() + tail_advance, stream);
-    }
-    _size += actual_size;
-  }
-
-  /**
-   * @brief Returns the first element that was written to the output.
-   *        Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
-   * @param stream The stream used to access the element.
-   * @return The first element that was written to the output.
-   */
-  [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const
-  {
-    return _chunks.front().front_element(stream);
-  }
-
-  /**
-   * @brief Returns the last element that was written to the output.
-   *        Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
-   * @param stream The stream used to access the element.
-   * @return The last element that was written to the output.
-   */
-  [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const
-  {
-    auto const& last_nonempty_chunk =
-      _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back();
-    return last_nonempty_chunk.back_element(stream);
-  }
-
-  [[nodiscard]] size_type size() const { return _size; }
-
-  /**
-   * @brief Gathers all previously written outputs into a single contiguous vector.
-   *
-   * @param stream The stream used to allocate and gather the output vector. All previous write
-   *               operations to the output buffer must have finished or happened on this stream.
-   * @param mr The memory resource used to allocate the output vector.
-   * @return The output vector.
-   */
-  rmm::device_uvector<T> gather(rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr) const
-  {
-    rmm::device_uvector<T> output{size(), stream, mr};
-    auto output_it = output.begin();
-    for (auto const& chunk : _chunks) {
-      output_it = thrust::copy(
-        rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it);
-    }
-    return output;
-  }
-
- private:
-  /**
-   * @brief Resizes a vector without reallocating
-   *
-   * @param vector The vector
-   * @param new_size The new size. Must be smaller than the vector's capacity
-   */
-  static void inplace_resize(rmm::device_uvector<T>& vector,
-                             size_type new_size,
-                             rmm::cuda_stream_view stream)
-  {
-    CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error");
-    vector.resize(new_size, stream);
-  }
-
-  /**
-   * @brief Returns the span consisting of all currently unused elements in the vector
-   * (`i >= size() and i < capacity()`).
-   *
-   * @param vector The vector.
-   * @return The span of unused elements.
-   */
-  static device_span<T> get_free_span(rmm::device_uvector<T>& vector)
-  {
-    return device_span<T>{vector.data() + vector.size(), vector.capacity() - vector.size()};
-  }
-
-  size_type _size;
-  size_type _max_write_size;
-  size_type _max_growth;
-  std::vector<rmm::device_uvector<T>> _chunks;
-};
-
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               byte_range_info byte_range,
@@ -732,9 +521,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
         chunk->data() + std::min<byte_offset>(sentinel - chunk_offset, chunk->size());
       auto const output_size = end - begin;
       auto char_output       = char_storage.next_output(scan_stream);
-      auto const split = begin + std::min<byte_offset>(output_size, char_output.head().size());
-      thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, split, char_output.head().begin());
-      thrust::copy(rmm::exec_policy_nosync(scan_stream), split, end, char_output.tail().begin());
+      thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin());
       char_storage.advance_output(output_size, scan_stream);
     }
 
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
new file mode 100644
index 00000000000..e45143480fc
--- /dev/null
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <thrust/copy.h>
+
+#include <iterator>
+
+namespace cudf {
+
+template <typename T>
+class split_device_span_iterator;
+
+/**
+ * @brief A device span consisting of two separate device_spans acting as if they were part of a
+ * single span. The first head.size() entries are served from the first span, the remaining
+ * tail.size() entries are served from the second span.
+ *
+ * @tparam T The type of elements in the span.
+ */
+template <typename T>
+class split_device_span {
+ public:
+  using element_type    = T;
+  using value_type      = std::remove_cv<T>;
+  using size_type       = std::size_t;
+  using difference_type = std::ptrdiff_t;
+  using pointer         = T*;
+  using iterator        = split_device_span_iterator<T>;
+  using const_pointer   = T const*;
+  using reference       = T&;
+  using const_reference = T const&;
+
+  split_device_span() = default;
+
+  explicit constexpr split_device_span(device_span<T> head, device_span<T> tail = {})
+    : _head{head}, _tail{tail}
+  {
+  }
+
+  [[nodiscard]] constexpr reference operator[](size_type i) const
+  {
+    return i < _head.size() ? _head[i] : _tail[i - _head.size()];
+  }
+
+  [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); }
+
+  [[nodiscard]] constexpr device_span<T> head() const { return _head; }
+
+  [[nodiscard]] constexpr device_span<T> tail() const { return _tail; }
+
+  [[nodiscard]] constexpr iterator begin() const;
+
+  [[nodiscard]] constexpr iterator end() const;
+
+ private:
+  device_span<T> _head;
+  device_span<T> _tail;
+};
+
+/**
+ * @brief A random access iterator indexing into a split_device_span.
+ *
+ * @tparam T The type of elements in the underlying span.
+ */
+template <typename T>
+class split_device_span_iterator {
+  using it = split_device_span_iterator;
+
+ public:
+  using size_type         = std::size_t;
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = T;
+  using pointer           = value_type*;
+  using reference         = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  split_device_span_iterator() = default;
+
+  constexpr split_device_span_iterator(split_device_span<T> span, size_type offset)
+    : _span{span}, _offset{offset}
+  {
+  }
+
+  [[nodiscard]] constexpr reference operator*() const { return _span[_offset]; }
+
+  [[nodiscard]] constexpr reference operator[](size_type i) const { return _span[_offset + i]; }
+
+  [[nodiscard]] constexpr friend bool operator==(const it& lhs, const it& rhs)
+  {
+    return lhs._offset == rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend bool operator!=(const it& lhs, const it& rhs)
+  {
+    return !(lhs == rhs);
+  }
+  [[nodiscard]] constexpr friend bool operator<(const it& lhs, const it& rhs)
+  {
+    return lhs._offset < rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend bool operator>=(const it& lhs, const it& rhs)
+  {
+    return !(lhs < rhs);
+  }
+
+  [[nodiscard]] constexpr friend bool operator>(const it& lhs, const it& rhs) { return rhs < lhs; }
+
+  [[nodiscard]] constexpr friend bool operator<=(const it& lhs, const it& rhs)
+  {
+    return !(lhs > rhs);
+  }
+
+  [[nodiscard]] constexpr friend difference_type operator-(const it& lhs, const it& rhs)
+  {
+    return lhs._offset - rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend it operator+(it lhs, difference_type i) { return lhs += i; }
+
+  constexpr it& operator+=(difference_type i)
+  {
+    _offset += i;
+    return *this;
+  }
+
+  constexpr it& operator-=(difference_type i) { return *this += -i; }
+
+  constexpr it& operator++() { return *this += 1; }
+
+  constexpr it& operator--() { return *this -= 1; }
+
+  constexpr it operator++(int)
+  {
+    auto result = *this;
+    ++*this;
+    return result;
+  }
+
+  constexpr it operator--(int)
+  {
+    auto result = *this;
+    --*this;
+    return result;
+  }
+
+ private:
+  split_device_span<T> _span;
+  size_type _offset;
+};
+
+template <typename T>
+[[nodiscard]] constexpr split_device_span_iterator<T> split_device_span<T>::begin() const
+{
+  return {*this, 0};
+}
+
+template <typename T>
+[[nodiscard]] constexpr split_device_span_iterator<T> split_device_span<T>::end() const
+{
+  return {*this, size()};
+}
+
+/**
+ * @brief A chunked storage class that provides preallocated memory for algorithms with known
+ * worst-case output size. It provides functionality to retrieve the next chunk to write to, for
+ * reporting how much memory was actually written and for gathering all previously written outputs
+ * into a single contiguous vector.
+ *
+ * @tparam T The output element type.
+ */
+template <typename T>
+class output_builder {
+ public:
+  using size_type = typename rmm::device_uvector<T>::size_type;
+
+  /**
+   * @brief Initializes an output builder with given worst-case output size and stream.
+   *
+   * @param max_write_size the maximum number of elements that will be written into a
+   *                       split_device_span returned from `next_output`.
+   * @param stream the stream used to allocate the first chunk of memory.
+   * @param mr optional, the memory resource to use for allocation.
+   */
+  output_builder(size_type max_write_size,
+                 size_type max_growth,
+                 rmm::cuda_stream_view stream,
+                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+    : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth}
+  {
+    CUDF_EXPECTS(max_write_size > 0, "Internal error");
+    _chunks.emplace_back(0, stream, mr);
+    _chunks.back().reserve(max_write_size * 2, stream);
+  }
+
+  output_builder(output_builder&&)      = delete;
+  output_builder(const output_builder&) = delete;
+  output_builder& operator=(output_builder&&) = delete;
+  output_builder& operator=(const output_builder&) = delete;
+
+  /**
+   * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage.
+   * Must be followed by a call to `advance_output` after the memory has been written to.
+   *
+   * @param stream The stream to allocate a new chunk of memory with, if necessary.
+   *               This should be the stream that will write to the `split_device_span`.
+   * @return A `split_device_span` starting directly after the last output and providing at least
+   *         `max_write_size` entries of storage.
+   */
+  [[nodiscard]] split_device_span<T> next_output(rmm::cuda_stream_view stream)
+  {
+    auto head_it   = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1);
+    auto head_span = get_free_span(*head_it);
+    if (head_span.size() >= _max_write_size) { return split_device_span<T>{head_span}; }
+    if (head_it == _chunks.end() - 1) {
+      // insert a new device_uvector of double size
+      auto const next_chunk_size =
+        std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity());
+      _chunks.emplace_back(0, stream, _chunks.back().memory_resource());
+      _chunks.back().reserve(next_chunk_size, stream);
+    }
+    auto tail_span = get_free_span(_chunks.back());
+    CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error");
+    return split_device_span<T>{head_span, tail_span};
+  }
+
+  /**
+   * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was
+   *        written to.
+   *
+   * @param actual_size The number of elements that were written to the result of the previous
+   *                    `next_output` call.
+   * @param stream The stream on which to resize the vectors. Since this function will not
+   *               reallocate, this only changes the stream of the internally stored vectors,
+   *               impacting their subsequent copy and destruction behavior.
+   */
+  void advance_output(size_type actual_size, rmm::cuda_stream_view stream)
+  {
+    CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error");
+    if (_chunks.size() < 2) {
+      auto const new_size = _chunks.back().size() + actual_size;
+      inplace_resize(_chunks.back(), new_size, stream);
+    } else {
+      auto& tail              = _chunks.back();
+      auto& prev              = _chunks.rbegin()[1];
+      auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size());
+      auto const tail_advance = actual_size - prev_advance;
+      inplace_resize(prev, prev.size() + prev_advance, stream);
+      inplace_resize(tail, tail.size() + tail_advance, stream);
+    }
+    _size += actual_size;
+  }
+
+  /**
+   * @brief Returns the first element that was written to the output.
+   *        Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
+   * @param stream The stream used to access the element.
+   * @return The first element that was written to the output.
+   */
+  [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const
+  {
+    return _chunks.front().front_element(stream);
+  }
+
+  /**
+   * @brief Returns the last element that was written to the output.
+   *        Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
+   * @param stream The stream used to access the element.
+   * @return The last element that was written to the output.
+   */
+  [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const
+  {
+    auto const& last_nonempty_chunk =
+      _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back();
+    return last_nonempty_chunk.back_element(stream);
+  }
+
+  [[nodiscard]] size_type size() const { return _size; }
+
+  /**
+   * @brief Gathers all previously written outputs into a single contiguous vector.
+   *
+   * @param stream The stream used to allocate and gather the output vector. All previous write
+   *               operations to the output buffer must have finished or happened on this stream.
+   * @param mr The memory resource used to allocate the output vector.
+   * @return The output vector.
+   */
+  rmm::device_uvector<T> gather(rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr) const
+  {
+    rmm::device_uvector<T> output{size(), stream, mr};
+    auto output_it = output.begin();
+    for (auto const& chunk : _chunks) {
+      output_it = thrust::copy(
+        rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it);
+    }
+    return output;
+  }
+
+ private:
+  /**
+   * @brief Resizes a vector without reallocating
+   *
+   * @param vector The vector
+   * @param new_size The new size. Must be smaller than the vector's capacity
+   * @param stream The stream on which to resize the vector. Since this function will not
+   *               reallocate, this only changes the stream of `vector`, impacting its subsequent
+   *               copy and destruction behavior.
+   */
+  static void inplace_resize(rmm::device_uvector<T>& vector,
+                             size_type new_size,
+                             rmm::cuda_stream_view stream)
+  {
+    CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error");
+    vector.resize(new_size, stream);
+  }
+
+  /**
+   * @brief Returns the span consisting of all currently unused elements in the vector
+   *        (`i >= size() and i < capacity()`).
+   *
+   * @param vector The vector.
+   * @return The span of unused elements.
+   */
+  static device_span<T> get_free_span(rmm::device_uvector<T>& vector)
+  {
+    return device_span<T>{vector.data() + vector.size(), vector.capacity() - vector.size()};
+  }
+
+  size_type _size;
+  size_type _max_write_size;
+  size_type _max_growth;
+  std::vector<rmm::device_uvector<T>> _chunks;
+};
+
+}  // namespace cudf
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 2da7073b334..2783b006982 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <io/utilities/output_builder.cuh>
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -26,6 +28,7 @@
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
 
 using namespace cudf;
 using namespace test;
@@ -499,4 +502,65 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS);
 }
 
+TEST_F(MultibyteSplitTest, EmptySplitDeviceSpan)
+{
+  cudf::split_device_span<int> span;
+  ASSERT_EQ(span.size(), 0);
+  ASSERT_EQ(span.head().size(), 0);
+  ASSERT_EQ(span.head().data(), nullptr);
+  ASSERT_EQ(span.tail().size(), 0);
+  ASSERT_EQ(span.tail().data(), nullptr);
+}
+
+TEST_F(MultibyteSplitTest, SplitDeviceSpan)
+{
+  int i = 0;
+  int j = 1;
+  cudf::split_device_span<int> span{{&i, 1}, {&j, 1}};
+  ASSERT_EQ(span.size(), 2);
+  ASSERT_EQ(span.head().size(), 1);
+  ASSERT_EQ(span.head().data(), &i);
+  ASSERT_EQ(span.tail().size(), 1);
+  ASSERT_EQ(span.tail().data(), &j);
+  ASSERT_EQ(&span[0], &i);
+  ASSERT_EQ(&span[1], &j);
+  ASSERT_EQ(&*span.begin(), &i);
+  ASSERT_EQ(&*(span.begin() + 1), &j);
+  ASSERT_NE(span.begin() + 1, span.end());
+  ASSERT_EQ(span.begin() + 2, span.end());
+}
+
+TEST_F(MultibyteSplitTest, OutputBuilder)
+{
+  auto const stream = cudf::get_default_stream();
+  cudf::output_builder<char> builder{10, 4, stream};
+  auto const output = builder.next_output(stream);
+  ASSERT_GE(output.size(), 10);
+  ASSERT_EQ(output.tail().size(), 0);
+  ASSERT_EQ(output.tail().data(), nullptr);
+  ASSERT_EQ(builder.size(), 0);
+  builder.advance_output(1, stream);
+  ASSERT_EQ(builder.size(), 1);
+  auto const output2 = builder.next_output(stream);
+  ASSERT_EQ(output2.head().data(), output.head().data() + 1);
+  builder.advance_output(10, stream);
+  ASSERT_EQ(builder.size(), 11);
+  auto const output3 = builder.next_output(stream);
+  ASSERT_EQ(output3.head().size(), 9);
+  ASSERT_EQ(output3.head().data(), output.head().data() + 11);
+  ASSERT_EQ(output3.tail().size(), 40);
+  builder.advance_output(9, stream);
+  ASSERT_EQ(builder.size(), 20);
+  auto const output4 = builder.next_output(stream);
+  ASSERT_EQ(output4.head().size(), 0);
+  ASSERT_EQ(output4.tail().size(), output3.tail().size());
+  ASSERT_EQ(output4.tail().data(), output3.tail().data());
+  builder.advance_output(1, stream);
+  auto const output5 = builder.next_output(stream);
+  ASSERT_EQ(output5.head().size(), 39);
+  ASSERT_EQ(output5.head().data(), output4.tail().data() + 1);
+  ASSERT_EQ(output5.tail().size(), 0);
+  ASSERT_EQ(output5.tail().data(), nullptr);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From bac20048c488827747ba7ba9c596af9f38aceff7 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 27 Oct 2022 15:03:48 -0500
Subject: [PATCH 4/4] Add pivot_table and crosstab to docs. (#12014)

This PR resolves #12012 by adding `cudf.pivot_table` and `cudf.crosstab` to the documentation.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/12014
---
 docs/cudf/source/api_docs/general_functions.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst
index 272d95e84bc..40e1b766dc9 100644
--- a/docs/cudf/source/api_docs/general_functions.rst
+++ b/docs/cudf/source/api_docs/general_functions.rst
@@ -14,6 +14,8 @@ Data manipulations
    cudf.get_dummies
    cudf.melt
    cudf.pivot
+   cudf.pivot_table
+   cudf.crosstab
    cudf.unstack
 
 Top-level conversions