rapidsai · rapids-bot · Oct 27, 2022 · Oct 19, 2022 · Oct 19, 2022 · Oct 19, 2022
@@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const delim_percent      = state.get_int64("delim_percent");
   auto const file_size_approx   = state.get_int64("size_approx");
   auto const byte_range_percent = state.get_int64("byte_range_percent");
+  auto const strip_delimiters   = bool(state.get_int64("strip_delimiters"));
 
   auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
   CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
@@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const range_size   = static_cast<int64_t>(device_input.size() * byte_range_factor);
   auto const range_offset = (device_input.size() - range_size) / 2;
   cudf::io::text::byte_range_info range{range_offset, range_size};
+  cudf::io::text::parse_options options{range, strip_delimiters};
   std::unique_ptr<cudf::column> output;
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     try_drop_l3_cache();
-    output = cudf::io::text::multibyte_split(*source, delim, range);
+    output = cudf::io::text::multibyte_split(*source, delim, options);
   });
 
   state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
@@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
 
 NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
   .set_name("multibyte_split")
+  .add_int64_axis("strip_delimiters", {0, 1})
   .add_int64_axis("delim_size", {1, 4, 7})
   .add_int64_axis("delim_percent", {1, 25})
   .add_int64_power_of_two_axis("size_approx", {15, 30})

@@ -30,11 +30,25 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Parsing options for multibyte_split: byte range selection and delimiter stripping.
+ */
+struct parse_options {
+  /**
+   * @brief Only rows starting inside this byte range will be part of the output column.
+   */
+  byte_range_info byte_range = create_byte_range_info_max();
+  /**
+   * @brief If true, delimiters at the end of rows are stripped from the output column.
+   */
+  bool strip_delimiters = false;
+};
+
 /**
  * @brief Splits the source text into a strings column using a multiple byte delimiter.
  *
- * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
- * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * Providing a byte range allows multibyte_split to read a file partially, only returning the
+ * offsets of delimiters which begin within the range. If thinking in terms of "records", where each
  * delimiter dictates the end of a record, all records which begin within the byte range provided
  * will be returned, including any record which may begin in the range but end outside of the
  * range. Records which begin outside of the range will ignored, even if those records end inside
@@ -63,16 +77,22 @@ namespace text {
  *
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range range in which to consider offsets relevant
+ * @param options The parsing options to use (byte range and delimiter stripping)
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
  */
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  std::optional<byte_range_info> byte_range = std::nullopt,
-  rmm::mr::device_memory_resource* mr       = rmm::mr::get_current_device_resource());
+  parse_options options               = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+std::unique_ptr<cudf::column> multibyte_split(  // overload taking only an optional byte range
+  data_chunk_source const& source,
+  std::string const& delimiter,
+  std::optional<byte_range_info> byte_range,  // nullopt falls back to create_byte_range_info_max()
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
                                               std::string const& delimiter,

@@ -21,13 +21,16 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
 #include <cudf/io/text/detail/multistate.hpp>
 #include <cudf/io/text/detail/tile_state.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -551,6 +554,7 @@ class output_builder {
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               byte_range_info byte_range,
+                                              bool strip_delimiters,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr,
                                               rmm::cuda_stream_pool& stream_pool)
@@ -756,8 +760,12 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   auto chars          = char_storage.gather(stream, mr);
   auto global_offsets = row_offset_storage.gather(stream, mr);
 
-  bool const insert_begin = *first_row_offset == 0;
-  bool const insert_end   = not last_row_offset.has_value() or last_row_offset == chunk_offset;
+  // insert an offset at the beginning if we started at the beginning of the input
+  bool const insert_begin = first_row_offset.value_or(0) == 0;
+  // insert an offset at the end if we have not terminated the last row
+  bool const insert_end =
+    not(last_row_offset.has_value() or
+        (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset));
   rmm::device_uvector<int32_t> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
@@ -771,10 +779,27 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                     [baseline = *first_row_offset] __device__(byte_offset global_offset) {
                       return static_cast<int32_t>(global_offset - baseline);
                     });
-
   auto string_count = offsets.size() - 1;
-
-  return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  if (strip_delimiters) {  // build the column from per-row views that exclude the delimiter
+    auto it = cudf::detail::make_counting_transform_iterator(
+      0,
+      [ofs        = offsets.data(),
+       chars      = chars.data(),
+       delim_size = static_cast<size_type>(delimiter.size()),
+       last_row   = static_cast<size_type>(string_count) - 1,
+       insert_end] __device__(size_type row) {
+        auto const begin = ofs[row];
+        auto const len   = ofs[row + 1] - begin;
+        if (row == last_row && insert_end) {  // unterminated last row: nothing to strip
+          return thrust::make_pair(chars + begin, len);
+        } else {  // clamp at zero so a row shorter than the delimiter cannot go negative
+          return thrust::make_pair(chars + begin, std::max<size_type>(0, len - delim_size));
+        }
+      });
+    return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
+  } else {  // keep delimiters: hand the gathered offsets and chars to the column as-is
+    return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  }
 }
 
 }  // namespace detail
@@ -783,12 +808,21 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
                                               rmm::mr::device_memory_resource* mr)
+{
+  return multibyte_split(
+    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+}
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              parse_options options,
+                                              rmm::mr::device_memory_resource* mr)
 {
   auto stream      = cudf::get_default_stream();
   auto stream_pool = rmm::cuda_stream_pool(2);
 
   auto result = detail::multibyte_split(
-    source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool);
+    source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool);
 
   return result;
 }
@@ -797,7 +831,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return multibyte_split(source, delimiter, std::nullopt, mr);
+  return multibyte_split(source, delimiter, parse_options{}, mr);
 }
 
 }  // namespace text

@@ -62,12 +62,25 @@ TEST_F(MultibyteSplitTest, NondeterministicMatching)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, NoDelimiter)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg");  // input never contains the delimiter
+
+  auto expected = strings_column_wrapper{"abcdefg"};  // whole input comes back as a single row
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, DelimiterAtEnd)
 {
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiter);
@@ -80,7 +93,7 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   auto delimiter  = std::string(":");
   auto host_input = std::string("abcdefg:");
 
-  auto expected = strings_column_wrapper{"abcdefg:", ""};
+  auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(
@@ -91,6 +104,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg:");
+
+  auto expected = strings_column_wrapper{"abcdefg:"};  // the row keeps its trailing delimiter
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(
+    *source,
+    delimiter,  // the byte range below stops one byte short of the end of the input
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, LargeInputSparse)
 {
   auto host_input    = std::string(1024 * 1024 * 32, '.');
@@ -120,8 +149,6 @@ TEST_F(MultibyteSplitTest, LargeInput)
     host_expected.emplace_back(std::string("...:|"));
   }
 
-  host_expected.emplace_back(std::string(""));
-
   auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()};
 
   auto delimiter = std::string("...:|");
@@ -146,6 +173,52 @@ TEST_F(MultibyteSplitTest, OverlappingMatchErasure)
   // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported.
 }
 
+TEST_F(MultibyteSplitTest, DelimiterErasure)
+{
+  auto delimiter = "\r\n";  // two-byte delimiter; the input below also ends with it
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;  // rows are expected without their trailing delimiter
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterErasureByteRange)
+{
+  auto delimiter = "\r\n";  // two-byte delimiter; the input below also ends with it
+
+  auto host_input = std::string("line\r\nanother line\r\nthird line\r\n");
+  auto expected   = strings_column_wrapper{"line", "another line", "third line"};
+
+  cudf::io::text::parse_options options;  // the range set below omits only the last input byte
+  options.strip_delimiters = true;
+  options.byte_range       = cudf::io::text::byte_range_info(0, host_input.size() - 1);
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterErasureOverlap)
+{
+  auto delimiter = "::";  // the input has ":::" and "::::" runs, so matches overlap
+
+  auto host_input = std::string("::a:::b::c::::d");
+  auto expected   = strings_column_wrapper{"", "a", "", "b", "c", "", "", "d"};
+
+  cudf::io::text::parse_options options;
+  options.strip_delimiters = true;  // rows shorter than the delimiter are expected as ""
+  auto source              = cudf::io::text::make_source(host_input);
+  auto out                 = cudf::io::text::multibyte_split(*source, delimiter, options);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, HandpickedInput)
 {
   auto delimiters = "::|";
@@ -184,7 +257,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput)
     "ggg::|",         "hhh::|",      "___::|",       "here::|", "is::|",     "another::|",
     "simple::|",      "text::|",     "seperated::|", "by::|",   "emojis::|", "which::|",
     "are::|",         "multiple::|", "bytes::|",     "and::|",  "used::|",   "as::|",
-    "delimiters.::|", "::|",         "::|",          "::|",     ""};
+    "delimiters.::|", "::|",         "::|",          "::|"};
 
   auto source = cudf::io::text::make_source(host_input);
   auto out    = cudf::io::text::multibyte_split(*source, delimiters);
@@ -359,6 +432,21 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
   }
 }
 
+TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
+{
+  // we want a delimiter at the end of the file to not create a new empty row even if it is the only
+  // character in the byte range
+  using namespace cudf::io::text;
+  auto host_input = std::string("ab:cd:");
+  auto delimiter  = std::string(":");
+  auto source     = make_source(host_input);
+  auto expected   = strings_column_wrapper{};  // no rows at all are expected
+
+  auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});  // only the last ':'
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS);
+}
+
 TEST_F(MultibyteSplitTest, EmptyInput)
 {
   using namespace cudf::io::text;

@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint64_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -37,9 +38,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
 cdef extern from "cudf/io/text/multibyte_split.hpp" \
         namespace "cudf::io::text" nogil:
 
-    unique_ptr[column] multibyte_split(data_chunk_source source,
-                                       string delimiter) except +
+    cdef cppclass parse_options:  # mirrors the C++ parse_options struct
+        byte_range_info byte_range
+        bool strip_delimiters
+
+        parse_options() except +  # translate C++ exceptions
 
     unique_ptr[column] multibyte_split(data_chunk_source source,
                                        string delimiter,
-                                       byte_range_info byte_range) except +
+                                       parse_options options) except +