Merge remote-tracking branch 'upstream/branch-22.12' into dataframe-p…

…ivot_table
rapidsai · Oct 27, 2022 · fc2f3e7 · fc2f3e7
2 parents 05a748a + bac2004
commit fc2f3e7
Show file tree

Hide file tree

Showing 13 changed files with 641 additions and 251 deletions.
diff --git a/.gitignore b/.gitignore
@@ -165,3 +165,5 @@ dask-worker-space/
 # Sphinx docs & build artifacts
 docs/cudf/source/api_docs/generated/*
 docs/cudf/source/api_docs/api/*
+docs/cudf/source/user_guide/example_output/*
+docs/cudf/source/user_guide/cudf.*Dtype.*.rst
diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const delim_percent      = state.get_int64("delim_percent");
   auto const file_size_approx   = state.get_int64("size_approx");
   auto const byte_range_percent = state.get_int64("byte_range_percent");
+  auto const strip_delimiters   = bool(state.get_int64("strip_delimiters"));
 
   auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
   CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
@@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const range_size   = static_cast<int64_t>(device_input.size() * byte_range_factor);
   auto const range_offset = (device_input.size() - range_size) / 2;
   cudf::io::text::byte_range_info range{range_offset, range_size};
+  cudf::io::text::parse_options options{range, strip_delimiters};
   std::unique_ptr<cudf::column> output;
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     try_drop_l3_cache();
-    output = cudf::io::text::multibyte_split(*source, delim, range);
+    output = cudf::io::text::multibyte_split(*source, delim, options);
   });
 
   state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
@@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
 
 NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
   .set_name("multibyte_split")
+  .add_int64_axis("strip_delimiters", {0, 1})
   .add_int64_axis("delim_size", {1, 4, 7})
   .add_int64_axis("delim_percent", {1, 25})
   .add_int64_power_of_two_axis("size_approx", {15, 30})

diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,11 +30,25 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Parsing options for multibyte_split.
+ */
+struct parse_options {
+  /**
+   * @brief Only rows starting inside this byte range will be part of the output column.
+   */
+  byte_range_info byte_range = create_byte_range_info_max();
+  /**
+   * @brief Whether delimiters at the end of rows should be stripped from the output column
+   */
+  bool strip_delimiters = false;
+};
+
 /**
  * @brief Splits the source text into a strings column using a multiple byte delimiter.
  *
- * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
- * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * Providing a byte range allows multibyte_split to read a file partially, only returning the
+ * offsets of delimiters which begin within the range. If thinking in terms of "records", where each
  * delimiter dictates the end of a record, all records which begin within the byte range provided
  * will be returned, including any record which may begin in the range but end outside of the
  * range. Records which begin outside of the range will ignored, even if those records end inside
@@ -63,16 +77,22 @@ namespace text {
  *
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range range in which to consider offsets relevant
+ * @param options the parsing options to use (including byte range)
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
  */
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  std::optional<byte_range_info> byte_range = std::nullopt,
-  rmm::mr::device_memory_resource* mr       = rmm::mr::get_current_device_resource());
+  parse_options options               = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+std::unique_ptr<cudf::column> multibyte_split(
+  data_chunk_source const& source,
+  std::string const& delimiter,
+  std::optional<byte_range_info> byte_range,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
                                               std::string const& delimiter,