From 0b345d4008d0d77a4bc66fc37be1e28b4dd5c01f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 19 Oct 2022 12:53:19 +0000
Subject: [PATCH 01/10] add strip_delimiters option to read_text

---
 cpp/benchmarks/io/text/multibyte_split.cpp   |  5 +-
 cpp/include/cudf/io/text/multibyte_split.hpp | 30 ++++++++--
 cpp/src/io/text/multibyte_split.cu           | 60 +++++++++++++++++++-
 cpp/tests/io/text/multibyte_split_test.cpp   | 30 ++++++++++
 python/cudf/cudf/_lib/cpp/io/text.pxd        | 10 +++-
 python/cudf/cudf/_lib/text.pyx               | 22 +++----
 python/cudf/cudf/io/text.py                  |  2 +
 python/cudf/cudf/tests/test_text.py          | 12 ++++
 8 files changed, 149 insertions(+), 22 deletions(-)
diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 380766fee46..c0e82b34623 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const delim_percent      = state.get_int64("delim_percent");
   auto const file_size_approx   = state.get_int64("size_approx");
   auto const byte_range_percent = state.get_int64("byte_range_percent");
+  auto const strip_delimiters   = bool(state.get_int64("strip_delimiters"));
 
   auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
   CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
@@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const range_size   = static_cast<int64_t>(device_input.size() * byte_range_factor);
   auto const range_offset = (device_input.size() - range_size) / 2;
   cudf::io::text::byte_range_info range{range_offset, range_size};
+  cudf::io::text::parse_options options{range, strip_delimiters};
   std::unique_ptr<cudf::column> output;
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     try_drop_l3_cache();
-    output = cudf::io::text::multibyte_split(*source, delim, range);
+    output = cudf::io::text::multibyte_split(*source, delim, options);
   });
 
   state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
@@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
 
 NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
   .set_name("multibyte_split")
+  .add_int64_axis("strip_delimiters", {0, 1})
   .add_int64_axis("delim_size", {1, 4, 7})
   .add_int64_axis("delim_percent", {1, 25})
   .add_int64_power_of_two_axis("size_approx", {15, 30})
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index abb966a55bf..8524033d10c 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,11 +30,25 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Parsing options for multibyte_split.
+ */
+struct parse_options {
+  /**
+   * @brief Only rows starting inside this byte range will be part of the output column.
+   */
+  byte_range_info byte_range = create_byte_range_info_max();
+  /**
+   * @brief Should delimiters at the end of rows be stripped from the output column?
+   */
+  bool strip_delimiters = false;
+};
+
 /**
  * @brief Splits the source text into a strings column using a multiple byte delimiter.
  *
- * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
- * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * Providing a byte range allows multibyte_split to read a file partially, only returning the
+ * offsets of delimiters which begin within the range. If thinking in terms of "records", where each
  * delimiter dictates the end of a record, all records which begin within the byte range provided
  * will be returned, including any record which may begin in the range but end outside of the
  * range. Records which begin outside of the range will ignored, even if those records end inside
@@ -63,7 +77,7 @@ namespace text {
  *
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range range in which to consider offsets relevant
+ * @param options the parsing options to use (including byte range)
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
@@ -71,8 +85,14 @@ namespace text {
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  std::optional<byte_range_info> byte_range = std::nullopt,
-  rmm::mr::device_memory_resource* mr       = rmm::mr::get_current_device_resource());
+  parse_options options               = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+std::unique_ptr<cudf::column> multibyte_split(
+  data_chunk_source const& source,
+  std::string const& delimiter,
+  std::optional<byte_range_info> byte_range,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
                                               std::string const& delimiter,
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 29cec0e8c3f..86e6d5cd931 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -21,12 +21,14 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
 #include <cudf/io/text/detail/multistate.hpp>
 #include <cudf/io/text/detail/tile_state.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
@@ -551,6 +553,7 @@ class output_builder {
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               byte_range_info byte_range,
+                                              bool strip_delimiters,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr,
                                               rmm::cuda_stream_pool& stream_pool)
@@ -771,8 +774,50 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                     [baseline = *first_row_offset] __device__(byte_offset global_offset) {
                       return static_cast<int32_t>(global_offset - baseline);
                     });
-
   auto string_count = offsets.size() - 1;
+  if (strip_delimiters) {
+    rmm::device_uvector<int32_t> new_offsets{offsets.size(), stream, mr};
+    auto const it = cudf::detail::make_counting_transform_iterator(
+      int32_t{0},
+      [ofs        = offsets.data(),
+       delim_size = static_cast<int32_t>(delimiter.size()),
+       last_row   = static_cast<int32_t>(string_count) - 1,
+       insert_end] __device__(int32_t i) {
+        // if we inserted a virtual delimiter at the end, we musn't strip it
+        if (i == last_row && insert_end) {
+          return ofs[i + 1] - ofs[i];
+        } else {
+          return std::max(ofs[i + 1] - ofs[i] - delim_size, 0);
+        }
+      });
+    new_offsets.set_element_to_zero_async(0, stream);
+    thrust::inclusive_scan(
+      rmm::exec_policy(stream), it, it + offsets.size() - 1, new_offsets.begin() + 1);
+    auto const new_char_count = static_cast<std::size_t>(new_offsets.back_element(stream));
+    rmm::device_uvector<char> new_chars{new_char_count, stream, mr};
+    // copy the chars over in striped accesses of slice_size chars, 8 chars per thread on avg
+    auto const slice_size  = cudf::util::div_rounding_up_safe(new_char_count, string_count * 8);
+    auto const counting_it = thrust::make_counting_iterator(std::size_t{});
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       counting_it,
+                       string_count * slice_size,
+                       [chars     = chars.data(),
+                        new_chars = new_chars.data(),
+                        ofs       = offsets.data(),
+                        new_ofs   = new_offsets.data(),
+                        slice_size] __device__(std::size_t i) {
+                         auto const row       = i / slice_size;
+                         auto const stripe    = i % slice_size;
+                         auto const in_begin  = ofs[row];
+                         auto const out_begin = new_ofs[row];
+                         auto const out_size  = new_ofs[row + 1] - out_begin;
+                         for (int32_t i = stripe; i < out_size; i += slice_size) {
+                           new_chars[out_begin + i] = chars[in_begin + i];
+                         }
+                       });
+    offsets = std::move(new_offsets);
+    chars   = std::move(new_chars);
+  }
 
   return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
 }
@@ -783,12 +828,21 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
                                               rmm::mr::device_memory_resource* mr)
+{
+  return multibyte_split(
+    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+}
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              parse_options options,
+                                              rmm::mr::device_memory_resource* mr)
 {
   auto stream      = cudf::get_default_stream();
   auto stream_pool = rmm::cuda_stream_pool(2);
 
   auto result = detail::multibyte_split(
-    source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool);
+    source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool);
 
   return result;
 }
@@ -797,7 +851,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return multibyte_split(source, delimiter, std::nullopt, mr);
+  return multibyte_split(source, delimiter, parse_options{}, mr);
 }
 
 }  // namespace text
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 43debf3d5b3..d84d1a5bb24 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -146,6 +146,36 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure)
   // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported.
 }
 
+TEST_F(MultibyteSplitTest, DelimiterErasure)
+{
+  auto delimiter = "\r\n";
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line", ""};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterErasureOverlap)
+{
+  auto delimiter = "::";
+
+  auto host_input = std::string("::a:::b::c::::d");
+  auto expected   = strings_column_wrapper{"", "a", "", "b", "c", "", "", "d"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, HandpickedInput)
 {
   auto delimiters = "::|";
diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd
index 7bbe870dad3..368b014ea4b 100644
--- a/python/cudf/cudf/_lib/cpp/io/text.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/text.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint64_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -37,9 +38,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
 cdef extern from "cudf/io/text/multibyte_split.hpp" \
         namespace "cudf::io::text" nogil:
 
-    unique_ptr[column] multibyte_split(data_chunk_source source,
-                                       string delimiter) except +
+    cdef cppclass parse_options:
+        byte_range_info byte_range
+        bool strip_delimiters
+
+        parse_options() except +
 
     unique_ptr[column] multibyte_split(data_chunk_source source,
                                        string delimiter,
-                                       byte_range_info byte_range) except +
+                                       parse_options options) except +
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index 31a5617af58..be11132497e 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -19,12 +19,14 @@ from cudf._lib.cpp.io.text cimport (
     make_source_from_bgzip_file,
     make_source_from_file,
     multibyte_split,
+    parse_options,
 )
 
 
 def read_text(object filepaths_or_buffers,
               object delimiter=None,
               object byte_range=None,
+              object strip_delimiters=False,
               object compression=None,
               object compression_offsets=None):
     """
@@ -44,6 +46,7 @@ def read_text(object filepaths_or_buffers,
     cdef byte_range_info c_byte_range
     cdef uint64_t c_compression_begin_offset
     cdef uint64_t c_compression_end_offset
+    cdef parse_options c_options
 
     if compression is None:
         if isinstance(filepaths_or_buffers, TextIOBase):
@@ -71,19 +74,18 @@ def read_text(object filepaths_or_buffers,
     else:
         raise ValueError("Only bgzip compression is supported at the moment")
 
-    if (byte_range is None):
-        with nogil:
-            c_col = move(multibyte_split(dereference(datasource), delim))
-    else:
+    c_options = parse_options()
+    if byte_range is not None:
         c_byte_range_offset = byte_range[0]
         c_byte_range_size = byte_range[1]
-        c_byte_range = byte_range_info(
+        c_options.byte_range = byte_range_info(
             c_byte_range_offset,
             c_byte_range_size)
-        with nogil:
-            c_col = move(multibyte_split(
-                dereference(datasource),
-                delim,
-                c_byte_range))
+    c_options.strip_delimiters = strip_delimiters
+    with nogil:
+        c_col = move(multibyte_split(
+            dereference(datasource),
+            delim,
+            c_options))
 
     return {None: Column.from_unique_ptr(move(c_col))}
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 23983f01966..f341edbf6c1 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -14,6 +14,7 @@ def read_text(
     filepath_or_buffer,
     delimiter=None,
     byte_range=None,
+    strip_delimiters=False,
     compression=None,
     compression_offsets=None,
     **kwargs,
@@ -35,6 +36,7 @@ def read_text(
             filepath_or_buffer,
             delimiter=delimiter,
             byte_range=byte_range,
+            strip_delimiters=strip_delimiters,
             compression=compression,
             compression_offsets=compression_offsets,
         )
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 7f41d606473..2c5c96781fe 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -847,6 +847,18 @@ def test_read_text_in_memory(datadir):
     assert_eq(expected, actual)
 
 
+def test_read_text_in_memory_strip_delimiter(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x", "y", "z"])
+
+    actual = cudf.read_text(
+        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
+    )
+
+    assert_eq(expected, actual)
+
+
 def test_read_text_bgzip(datadir):
     chess_file_compressed = str(datadir) + "/chess.pgn.gz"
     chess_file = str(datadir) + "/chess.pgn"

From 3e4d464eff25e02bd6828da2626f8f17459c8bbf Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 19 Oct 2022 18:22:00 +0000
Subject: [PATCH 02/10] use gathering make_string_column

---
 cpp/src/io/text/multibyte_split.cu | 55 ++++++++----------------------
 1 file changed, 15 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 86e6d5cd931..9989bf42d63 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -30,6 +30,7 @@
 #include <cudf/io/text/detail/tile_state.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -776,50 +777,24 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                     });
   auto string_count = offsets.size() - 1;
   if (strip_delimiters) {
-    rmm::device_uvector<int32_t> new_offsets{offsets.size(), stream, mr};
-    auto const it = cudf::detail::make_counting_transform_iterator(
-      int32_t{0},
+    auto it = cudf::detail::make_counting_transform_iterator(
+      0,
       [ofs        = offsets.data(),
-       delim_size = static_cast<int32_t>(delimiter.size()),
-       last_row   = static_cast<int32_t>(string_count) - 1,
-       insert_end] __device__(int32_t i) {
-        // if we inserted a virtual delimiter at the end, we musn't strip it
-        if (i == last_row && insert_end) {
-          return ofs[i + 1] - ofs[i];
+       chars      = chars.data(),
+       delim_size = static_cast<size_type>(delimiter.size()),
+       last_row   = static_cast<size_type>(string_count) - 1] __device__(size_type row) {
+        auto const begin = ofs[row];
+        auto const len   = ofs[row + 1] - begin;
+        if (row == last_row) {
+          return thrust::make_pair(chars + begin, len);
         } else {
-          return std::max(ofs[i + 1] - ofs[i] - delim_size, 0);
-        }
+          return thrust::make_pair(chars + begin, std::max<size_type>(0, len - delim_size));
+        };
       });
-    new_offsets.set_element_to_zero_async(0, stream);
-    thrust::inclusive_scan(
-      rmm::exec_policy(stream), it, it + offsets.size() - 1, new_offsets.begin() + 1);
-    auto const new_char_count = static_cast<std::size_t>(new_offsets.back_element(stream));
-    rmm::device_uvector<char> new_chars{new_char_count, stream, mr};
-    // copy the chars over in striped accesses of slice_size chars, 8 chars per thread on avg
-    auto const slice_size  = cudf::util::div_rounding_up_safe(new_char_count, string_count * 8);
-    auto const counting_it = thrust::make_counting_iterator(std::size_t{});
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       counting_it,
-                       string_count * slice_size,
-                       [chars     = chars.data(),
-                        new_chars = new_chars.data(),
-                        ofs       = offsets.data(),
-                        new_ofs   = new_offsets.data(),
-                        slice_size] __device__(std::size_t i) {
-                         auto const row       = i / slice_size;
-                         auto const stripe    = i % slice_size;
-                         auto const in_begin  = ofs[row];
-                         auto const out_begin = new_ofs[row];
-                         auto const out_size  = new_ofs[row + 1] - out_begin;
-                         for (int32_t i = stripe; i < out_size; i += slice_size) {
-                           new_chars[out_begin + i] = chars[in_begin + i];
-                         }
-                       });
-    offsets = std::move(new_offsets);
-    chars   = std::move(new_chars);
+    return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
+  } else {
+    return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
   }
-
-  return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
 }
 
 }  // namespace detail

From 84a975ab7479eeb26402f11a8878aa2af23f3bee Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 19 Oct 2022 18:45:56 +0000
Subject: [PATCH 03/10] restore original handling of delimiter in last row

---
 cpp/src/io/text/multibyte_split.cu         | 11 ++++++--
 cpp/tests/io/text/multibyte_split_test.cpp | 32 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 9989bf42d63..2416ff95e6c 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -761,7 +761,11 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   auto global_offsets = row_offset_storage.gather(stream, mr);
 
   bool const insert_begin = *first_row_offset == 0;
-  bool const insert_end   = not last_row_offset.has_value() or last_row_offset == chunk_offset;
+  // insert a value at the end either if we found no delimiter past the byte range (to complete the
+  // last row), or if we found a delimiter at the very end (to add an empty row for the entry after
+  // the delimiter)
+  bool const insert_end = not last_row_offset.has_value() or
+                          (last_row_offset == chunk_offset and chunk_offset <= byte_range_end);
   rmm::device_uvector<int32_t> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
@@ -782,10 +786,11 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
       [ofs        = offsets.data(),
        chars      = chars.data(),
        delim_size = static_cast<size_type>(delimiter.size()),
-       last_row   = static_cast<size_type>(string_count) - 1] __device__(size_type row) {
+       last_row   = static_cast<size_type>(string_count) - 1,
+       insert_end] __device__(size_type row) {
         auto const begin = ofs[row];
         auto const len   = ofs[row + 1] - begin;
-        if (row == last_row) {
+        if (row == last_row && insert_end) {
           return thrust::make_pair(chars + begin, len);
         } else {
           return thrust::make_pair(chars + begin, std::max<size_type>(0, len - delim_size));
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index d84d1a5bb24..d923deba2fd 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -91,6 +91,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg:");
+
+  auto expected = strings_column_wrapper{"abcdefg:"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(
+    *source,
+    delimiter,
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, LargeInputSparse)
 {
   auto host_input    = std::string(1024 * 1024 * 32, '.');
@@ -161,6 +177,22 @@ TEST_F(MultibyteSplitTest, DelimiterErasure)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, DelimiterErasureByteRange)
+{
+  auto delimiter = "\r\n";
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;
+  options.byte_range       = cudf::io::text::byte_range_info(0, host_input.size() - 1);
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, DelimiterErasureOverlap)
 {
   auto delimiter = "::";

From 7c3028341c29b11b31ee0b0806aa3876e19062e0 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Thu, 20 Oct 2022 09:01:05 +0000
Subject: [PATCH 04/10] fix byte range matching last delimiter at end

---
 cpp/src/io/text/multibyte_split.cu         | 11 ++++++-----
 cpp/tests/io/text/multibyte_split_test.cpp | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 2416ff95e6c..72a6d94acf8 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -751,19 +751,20 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   join_stream(streams, stream);
 
   // if the input was empty, we didn't find a delimiter at all,
-  // or the first delimiter was also the last: empty output
+  // or the first delimiter was also the last (but not the last in the file): empty output
   if (chunk_offset == 0 or not first_row_offset.has_value() or
-      first_row_offset == last_row_offset) {
+      (first_row_offset == last_row_offset and first_row_offset < byte_range_end)) {
     return make_empty_column(type_id::STRING);
   }
 
   auto chars          = char_storage.gather(stream, mr);
   auto global_offsets = row_offset_storage.gather(stream, mr);
 
+  // insert an offset at the beginning if we started at the beginning of the input
   bool const insert_begin = *first_row_offset == 0;
-  // insert a value at the end either if we found no delimiter past the byte range (to complete the
-  // last row), or if we found a delimiter at the very end (to add an empty row for the entry after
-  // the delimiter)
+  // insert an offset at the end either if we found no delimiter past the byte range (to complete
+  // the last row), or if we found a delimiter at the very end (to add an empty row for the entry
+  // after the delimiter)
   bool const insert_end = not last_row_offset.has_value() or
                           (last_row_offset == chunk_offset and chunk_offset <= byte_range_end);
   rmm::device_uvector<int32_t> offsets{
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index d923deba2fd..a841c77a64f 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -421,6 +421,22 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
   }
 }
 
+TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
+{
+  // we want a delimiter at the end of the file to create a new empty row. To make concatenation of
+  // disjoint byte ranges work, we need to make sure this property is preserved even if the byte
+  // range contains only the last delimiter
+  using namespace cudf::io::text;
+  auto host_input = std::string("ab:cd:");
+  auto delimiter  = std::string(":");
+  auto source     = make_source(host_input);
+  auto expected   = strings_column_wrapper{""};
+
+  auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS);
+}
+
 TEST_F(MultibyteSplitTest, EmptyInput)
 {
   using namespace cudf::io::text;

From d7c3afc190ab81b38aa1ea2a1fe65607e0dd1bde Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Thu, 20 Oct 2022 16:44:07 +0000
Subject: [PATCH 05/10] strip empty row with delimiter at end

---
 cpp/src/io/text/multibyte_split.cu         | 14 ++++++--------
 cpp/tests/io/text/multibyte_split_test.cpp | 17 +++++++----------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 72a6d94acf8..465564856fe 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -751,9 +751,9 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   join_stream(streams, stream);
 
   // if the input was empty, we didn't find a delimiter at all,
-  // or the first delimiter was also the last (but not the last in the file): empty output
+  // or the first delimiter was also the last: empty output
   if (chunk_offset == 0 or not first_row_offset.has_value() or
-      (first_row_offset == last_row_offset and first_row_offset < byte_range_end)) {
+      first_row_offset == last_row_offset) {
     return make_empty_column(type_id::STRING);
   }
 
@@ -761,12 +761,10 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   auto global_offsets = row_offset_storage.gather(stream, mr);
 
   // insert an offset at the beginning if we started at the beginning of the input
-  bool const insert_begin = *first_row_offset == 0;
-  // insert an offset at the end either if we found no delimiter past the byte range (to complete
-  // the last row), or if we found a delimiter at the very end (to add an empty row for the entry
-  // after the delimiter)
-  bool const insert_end = not last_row_offset.has_value() or
-                          (last_row_offset == chunk_offset and chunk_offset <= byte_range_end);
+  bool const insert_begin = first_row_offset.value_or(0) == 0;
+  // insert an offset at the end if we have not terminated the last row
+  bool const insert_end =
+    not(last_row_offset.has_value() or global_offsets.back_element(stream) == chunk_offset);
   rmm::device_uvector<int32_t> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index a841c77a64f..40cd8c47ee7 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -67,7 +67,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd)
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiter);
@@ -80,7 +80,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(
@@ -136,8 +136,6 @@ TEST_F(MultibyteSplitTest, LargeInput)
     host_expected.emplace_back(std::string("...:|"));
   }
 
-  host_expected.emplace_back(std::string(""));
-
   auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()};
 
   auto delimiter = std::string("...:|");
@@ -167,7 +165,7 @@ TEST_F(MultibyteSplitTest, DelimiterErasure)
   auto delimiter = "\r\n";
 
   auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
-  auto expected   = strings_column_wrapper{"line", "another line", "third line", ""};
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
 
   cudf::io::text::parse_options options;
   options.strip_delimiters = true;
@@ -246,7 +244,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput)
     "ggg::|",         "hhh::|",      "___::|",       "here::|", "is::|",     "another::|",
     "simple::|",      "text::|",     "seperated::|", "by::|",   "emojis::|", "which::|",
     "are::|",         "multiple::|", "bytes::|",     "and::|",  "used::|",   "as::|",
-    "delimiters.::|", "::|",         "::|",          "::|",     ""};
+    "delimiters.::|", "::|",         "::|",          "::|"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiters);
@@ -423,14 +421,13 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
 
 TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
 {
-  // we want a delimiter at the end of the file to create a new empty row. To make concatenation of
-  // disjoint byte ranges work, we need to make sure this property is preserved even if the byte
-  // range contains only the last delimiter
+  // we want a delimiter at the end of the file to not create a new empty row even if it is the only
+  // character in the byte range
   using namespace cudf::io::text;
   auto host_input = std::string("ab:cd:");
   auto delimiter  = std::string(":");
   auto source     = make_source(host_input);
-  auto expected   = strings_column_wrapper{""};
+  auto expected   = strings_column_wrapper{};
 
   auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
 

From 8ae57072ed675784beb84f8b2f76605c12c929c5 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 26 Oct 2022 09:58:56 +0000
Subject: [PATCH 06/10] fix OOB access when no delimiter was found

---
 cpp/src/io/text/multibyte_split.cu         |  3 ++-
 cpp/tests/io/text/multibyte_split_test.cpp | 13 +++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 465564856fe..0d699fc72fd 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -764,7 +764,8 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   bool const insert_begin = first_row_offset.value_or(0) == 0;
   // insert an offset at the end if we have not terminated the last row
   bool const insert_end =
-    not(last_row_offset.has_value() or global_offsets.back_element(stream) == chunk_offset);
+    not(last_row_offset.has_value() or
+        (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset));
   rmm::device_uvector<int32_t> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 40cd8c47ee7..2da7073b334 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -62,6 +62,19 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, NoDelimiter)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg");
+
+  auto expected = strings_column_wrapper{"abcdefg"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, DelimiterAtEnd)
 {
   auto delimiter  = std::string(":");

From d99bfe599e4bf25d3dc5f2417210a83958e82dad Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 26 Oct 2022 09:59:36 +0000
Subject: [PATCH 07/10] fix python test

---
 python/cudf/cudf/tests/test_text.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 2c5c96781fe..627bf0a68bb 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -827,14 +827,20 @@ def test_read_text_byte_range(datadir):
 
 
 def test_read_text_byte_range_large(tmpdir):
-    content = str([["\n" if x % 5 == 0 else "x"] for x in range(0, 3000)])
-    delimiter = "1."
+    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
+    delimiter = "\n"
     temp_file = str(tmpdir) + "/temp.txt"
 
     with open(temp_file, "w") as f:
         f.write(content)
 
-    cudf.read_text(temp_file, delimiter=delimiter)
+    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
+
+    actual = cudf.read_text(
+        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
+    )
+
+    assert_eq(expected, actual)
 
 
 def test_read_text_in_memory(datadir):

From 01ba6f11243774db8c6aaeac4473ba4bfa557fcc Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <tribizel@nvidia.com>
Date: Wed, 26 Oct 2022 10:03:21 +0000
Subject: [PATCH 08/10] ad missing docstring

---
 python/cudf/cudf/utils/ioutils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 0a0647f1297..f7eaef57942 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1215,6 +1215,8 @@
 delimiter : string, default None
     The delimiter that should be used for splitting text chunks into
     separate cudf column rows. The delimiter may be one or more characters.
+strip_delimiters : boolean, default False
+    If True, removes delimiters that are normally kept at the end of strings.
 byte_range : list or tuple, default None
     Byte range within the input file to be read. The first number is the
     offset in bytes, the second number is the range size in bytes.

From 6c2a3b0a40c4f76b49209a1c1d711d126ca931f9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 26 Oct 2022 22:49:56 +0200
Subject: [PATCH 09/10] Reword parse_options docstring

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 cpp/include/cudf/io/text/multibyte_split.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index 8524033d10c..a7edc9be0e4 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -39,7 +39,7 @@ struct parse_options {
    */
   byte_range_info byte_range = create_byte_range_info_max();
   /**
-   * @brief Should delimiters at the end of rows be stripped from the output column?
+   * @brief Whether delimiters at the end of rows should be stripped from the output column
    */
   bool strip_delimiters = false;
 };

From ca5568ab4577ac856a5e91578a04a670fa7b3ab2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 27 Oct 2022 15:11:17 +0200
Subject: [PATCH 10/10] Extend docstring

---
 python/cudf/cudf/utils/ioutils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index f7eaef57942..5298e470a91 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1216,7 +1216,12 @@
     The delimiter that should be used for splitting text chunks into
     separate cudf column rows. The delimiter may be one or more characters.
 strip_delimiters : boolean, default False
-    If True, removes delimiters that are normally kept at the end of strings.
+    Unlike the `str.split()` function, `read_text` preserves the delimiter
+    at the end of a field in output by default, meaning `a;b;c` will turn into
+    `['a;','b;','c']` when using `;` as a delimiter.
+    Setting this option to `True` will strip these trailing delimiters,
+    leaving only the contents between delimiters in the resulting column:
+    `['a','b','c']`
 byte_range : list or tuple, default None
     Byte range within the input file to be read. The first number is the
     offset in bytes, the second number is the range size in bytes.