rapidsai · rapids-bot · Aug 24, 2021 · Jun 27, 2021 · Jun 27, 2021 · Jul 2, 2021
@@ -118,19 +118,25 @@ test:
     - test -f $PREFIX/include/cudf/hashing.hpp
     - test -f $PREFIX/include/cudf/interop.hpp
     - test -f $PREFIX/include/cudf/io/avro.hpp
+    - test -f $PREFIX/include/cudf/io/csv.hpp
     - test -f $PREFIX/include/cudf/io/data_sink.hpp
     - test -f $PREFIX/include/cudf/io/datasource.hpp
-    - test -f $PREFIX/include/cudf/io/orc_metadata.hpp
-    - test -f $PREFIX/include/cudf/io/csv.hpp
     - test -f $PREFIX/include/cudf/io/detail/avro.hpp
     - test -f $PREFIX/include/cudf/io/detail/csv.hpp
     - test -f $PREFIX/include/cudf/io/detail/json.hpp
     - test -f $PREFIX/include/cudf/io/detail/orc.hpp
     - test -f $PREFIX/include/cudf/io/detail/parquet.hpp
     - test -f $PREFIX/include/cudf/io/detail/utils.hpp
     - test -f $PREFIX/include/cudf/io/json.hpp
+    - test -f $PREFIX/include/cudf/io/orc_metadata.hpp
     - test -f $PREFIX/include/cudf/io/orc.hpp
     - test -f $PREFIX/include/cudf/io/parquet.hpp
+    - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
+    - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp
+    - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp
     - test -f $PREFIX/include/cudf/io/types.hpp
     - test -f $PREFIX/include/cudf/ipc.hpp
     - test -f $PREFIX/include/cudf/join.hpp

@@ -307,6 +307,7 @@ add_library(cudf
     src/io/parquet/writer_impl.cu
     src/io/statistics/orc_column_statistics.cu
     src/io/statistics/parquet_column_statistics.cu
+    src/io/text/multibyte_split.cu
     src/io/utilities/column_buffer.cpp
     src/io/utilities/data_sink.cpp
     src/io/utilities/datasource.cpp

@@ -245,3 +245,8 @@ ConfigureBench(STRINGS_BENCH
 # - json benchmark -------------------------------------------------------------------
 ConfigureBench(JSON_BENCH
   string/json_benchmark.cpp)
+
+###################################################################################################
+# - io benchmark ---------------------------------------------------------------------
+ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK
+  io/text/multibyte_split_benchmark.cpp)
@@ -33,6 +33,8 @@ using cudf::io::io_type;
   benchmark(name##_buffer_output, type_or_group, static_cast<uint32_t>(io_type::HOST_BUFFER)); \
   benchmark(name##_void_output, type_or_group, static_cast<uint32_t>(io_type::VOID));
 
+/**
+ * @brief Returns a file path associated with the directory `dir_path`.
+ *
+ * NOTE(review): only the declaration is visible here. Judging by the name, the returned path
+ * presumably names a randomly chosen file inside `dir_path` - confirm against the definition.
+ *
+ * @param dir_path Path of the directory the returned file path refers to.
+ * @return The file path as a string.
+ */
+std::string random_file_in_dir(std::string const& dir_path);
+
 /**
  * @brief Class to create a coupled `source_info` and `sink_info` of given type.
  */

@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf_test/file_utilities.hpp>
+
+#include <cudf/io/text/data_chunk_source_factories.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/transform.h>
+
+#include <cstdio>
+#include <fstream>
+#include <memory>
+
+using cudf::test::fixed_width_column_wrapper;
+
+temp_directory const temp_dir("cudf_gbench");
+
+/**
+ * @brief Selects which kind of `data_chunk_source` a benchmark run reads its input from.
+ *
+ * The enumerators are handed to the benchmark as integer arguments and converted back with a
+ * `static_cast` in `BM_multibyte_split`.
+ */
+enum data_chunk_source_type {
+  device,  ///< source is created from the device-resident string scalar
+  file,    ///< source is created from the temporary file
+  host,    ///< source is created from the host-side `std::string`
+};
+
+/**
+ * @brief Generates random benchmark input: random string values, each followed by `delim`,
+ * concatenated into one contiguous run of characters.
+ *
+ * @param num_chars Approximate total number of characters to generate.
+ * @param delim_factor Fraction of `num_chars` that should be made up of delimiter characters.
+ * @param deviation Relative spread of the value lengths around their average; the minimum and
+ * maximum value sizes are `avg * (1 - deviation)` and `avg * (1 + deviation)`.
+ * @param delim The delimiter appended after every value. Must not be empty.
+ * @return A string scalar owning the generated characters.
+ *
+ * @throw cudf::logic_error if `delim` is empty or the parameters leave no room for a single
+ * delimiter (which would otherwise cause a division by zero below).
+ */
+static cudf::string_scalar create_random_input(int32_t num_chars,
+                                               double delim_factor,
+                                               double deviation,
+                                               std::string delim)
+{
+  // `delim.size()` is used as a divisor right below.
+  CUDF_EXPECTS(not delim.empty(), "delimiter must not be empty");
+
+  auto const num_delims = static_cast<int32_t>((num_chars * delim_factor) / delim.size());
+
+  // one row per delimiter; `num_rows` is used as a divisor when computing the average value size.
+  CUDF_EXPECTS(num_delims > 0, "input is too small to contain a single delimiter");
+
+  auto const num_delim_chars = num_delims * delim.size();
+  auto const num_value_chars = num_chars - num_delim_chars;
+  auto const num_rows        = num_delims;
+  auto const value_size_avg  = static_cast<int32_t>(num_value_chars / num_rows);
+  auto const value_size_min  = static_cast<int32_t>(value_size_avg * (1 - deviation));
+  auto const value_size_max  = static_cast<int32_t>(value_size_avg * (1 + deviation));
+
+  data_profile table_profile;
+
+  table_profile.set_distribution_params(  //
+    cudf::type_id::STRING,
+    distribution_id::NORMAL,
+    value_size_min,
+    value_size_max);
+
+  // a single-column table of random string values.
+  auto const values_table = create_random_table(  //
+    {cudf::type_id::STRING},
+    1,
+    row_count{num_rows},
+    table_profile);
+
+  // pair every value with the delimiter and concatenate row-wise, yielding "<value><delim>" rows.
+  auto delim_scalar  = cudf::make_string_scalar(delim);
+  auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows);
+  auto input_table  = cudf::table_view({values_table->get_column(0).view(), delims_column->view()});
+  auto input_column = cudf::strings::concatenate(input_table);
+
+  // extract the chars from the returned strings column.
+  auto input_column_contents = input_column->release();
+  auto chars_column_contents = input_column_contents.children[1]->release();
+  auto chars_buffer          = chars_column_contents.data.release();
+
+  // turn the chars in to a string scalar.
+  return cudf::string_scalar(std::move(*chars_buffer));
+}
+
+/**
+ * @brief Benchmarks `cudf::io::text::multibyte_split` on randomly generated input.
+ *
+ * Benchmark arguments, in order:
+ *  - `range(0)`: the `data_chunk_source_type` to read the input from,
+ *  - `range(1)`: delimiter size in characters (at least 1),
+ *  - `range(2)`: percentage of the input made up of delimiter characters (1 to 50),
+ *  - `range(3)`: approximate input size in bytes.
+ *
+ * The input is generated once on the device, copied to the host and written to a temporary file;
+ * only the split itself is timed.
+ *
+ * @param state The benchmark state supplying the arguments and collecting the timings.
+ */
+static void BM_multibyte_split(benchmark::State& state)
+{
+  auto source_type      = static_cast<data_chunk_source_type>(state.range(0));
+  auto delim_size       = state.range(1);
+  auto delim_percent    = state.range(2);
+  auto file_size_approx = state.range(3);
+
+  CUDF_EXPECTS(delim_size >= 1, "delimiter size must be at least 1");
+  CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
+  CUDF_EXPECTS(delim_percent <= 50, "delimiter percent must be at most 50");
+
+  // Build a delimiter of `delim_size` ':' characters. The (char const*, count) constructor must
+  // not be used here: it copies `count` bytes starting at the literal and reads past its end for
+  // the larger delimiter sizes.
+  auto delim = std::string(static_cast<std::size_t>(delim_size), ':');
+
+  auto delim_factor = static_cast<double>(delim_percent) / 100;
+  auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
+  auto host_input   = thrust::host_vector<char>(device_input.size());
+
+  cudaMemcpyAsync(host_input.data(),
+                  device_input.data(),
+                  device_input.size() * sizeof(char),
+                  cudaMemcpyDeviceToHost,
+                  rmm::cuda_stream_default);
+
+  // the copy has to be complete before any host-side consumer reads `host_input`.
+  cudaDeviceSynchronize();
+
+  // built only after the copy so that the host source sees the same bytes as the other sources.
+  auto host_string = std::string(host_input.data(), host_input.size());
+
+  auto temp_file_name = random_file_in_dir(temp_dir.path());
+
+  {
+    // scoped so the stream is flushed and closed before the file is used as a source.
+    auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out);
+    temp_fostream.write(host_input.data(), host_input.size());
+  }
+
+  auto source = std::unique_ptr<cudf::io::text::data_chunk_source>(nullptr);
+
+  switch (source_type) {
+    case data_chunk_source_type::file:  //
+      source = cudf::io::text::make_source_from_file(temp_file_name);
+      break;
+    case data_chunk_source_type::host:  //
+      source = cudf::io::text::make_source(host_string);
+      break;
+    case data_chunk_source_type::device:  //
+      source = cudf::io::text::make_source(device_input);
+      break;
+    default: CUDF_FAIL();
+  }
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true);
+    auto output = cudf::io::text::multibyte_split(*source, delim);
+  }
+
+  state.SetBytesProcessed(state.iterations() * device_input.size());
+}
+
+/**
+ * @brief Fixture under which the multibyte split benchmarks are defined and registered.
+ *
+ * Adds nothing of its own on top of `cudf::benchmark`.
+ */
+class MultibyteSplitBenchmark : public cudf::benchmark {
+};
+
+// Defines a `MultibyteSplitBenchmark` fixture benchmark called `name` that runs
+// `BM_multibyte_split`, and registers it over the product of the following arguments:
+//   source type x delimiter size (chars) x delimiter percent x approximate input size (bytes).
+// Timing is reported manually (see `cuda_event_timer` in `BM_multibyte_split`), in milliseconds.
+#define MULTIBYTE_SPLIT_BM_BENCHMARK_DEFINE(name)                               \
+  BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \
+  {                                                                             \
+    BM_multibyte_split(state);                                                  \
+  }                                                                             \
+  BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name)                           \
+    ->ArgsProduct({{data_chunk_source_type::device,                             \
+                    data_chunk_source_type::file,                               \
+                    data_chunk_source_type::host},                              \
+                   {1, 4, 7},                                                   \
+                   {1, 25},                                                     \
+                   {1 << 15, 1 << 30}})                                         \
+    ->UseManualTime()                                                           \
+    ->Unit(::benchmark::kMillisecond);
+
+MULTIBYTE_SPLIT_BM_BENCHMARK_DEFINE(multibyte_split_simple);
@@ -442,6 +442,26 @@ std::unique_ptr<column> make_strings_column(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Construct a STRING type column given offsets, chars, and optional null mask and null
+ * count.
+ *
+ * The `offsets`, `chars` and `null_mask` arguments are taken by rvalue reference.
+ *
+ * @param[in] num_strings The number of strings the column represents.
+ * @param[in] offsets The offset values for this column. The number of elements is one more than the
+ * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the
+ * strings vector.
+ * @param[in] chars The char bytes for all the strings for this column. Individual strings are
+ * identified by the offsets and the nullmask.
+ * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for
+ *  nulls is used for interpreting this bitmask. Defaults to an empty buffer.
+ * @param[in] null_count The number of null string entries. Defaults to
+ * `cudf::UNKNOWN_NULL_COUNT`.
+ * @return The constructed strings column.
+ */
+std::unique_ptr<column> make_strings_column(size_type num_strings,
+                                            rmm::device_uvector<size_type>&& offsets,
+                                            rmm::device_uvector<char>&& chars,
+                                            rmm::device_buffer&& null_mask = {},
+                                            size_type null_count = cudf::UNKNOWN_NULL_COUNT);
+
 /**
  * @brief Construct a LIST type column given offsets column, child column, null mask and null
  * count.

@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/device_buffer.hpp>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+/**
+ * @brief A reader capable of producing views over device memory.
+ *
+ * The data chunk reader API encapsulates the idea of statefully traversing and loading a data
+ * source. A data source may be a file, a region of device memory, or a region of host memory.
+ * Reading data from these data sources efficiently requires different strategies depending on the
+ * type of data source, type of compression, capabilities of the host and device, and the data's
+ * destination. Whole-file decompression should be hidden behind this interface.
+ *
+ */
+class data_chunk_reader {
+ public:
+  /**
+   * @brief Virtual destructor so that implementations owned through a
+   * `std::unique_ptr<data_chunk_reader>` are destroyed correctly.
+   */
+  virtual ~data_chunk_reader() = default;
+
+  /**
+   * @brief Get the next chunk of bytes from the data source
+   *
+   * Performs any necessary work to read and prepare the underlying data source for consumption as a
+   * view over device memory. Common implementations may read from a file, copy data from host
+   * memory, allocate temporary memory, perform iterative decompression, or even launch device
+   * kernels.
+   *
+   * @param size desired number of bytes
+   * @param stream stream to associate allocations or perform work required to obtain chunk
+   * @return a chunk of data up to `size` bytes, or fewer if no more data is available
+   */
+  virtual device_span<char const> get_next_chunk(uint32_t size, rmm::cuda_stream_view stream) = 0;
+};
+
+/**
+ * @brief A data source capable of creating a reader which can produce views of the data source in
+ * device memory.
+ *
+ */
+class data_chunk_source {
+ public:
+  /**
+   * @brief Virtual destructor so that implementations owned through a
+   * `std::unique_ptr<data_chunk_source>` are destroyed correctly.
+   */
+  virtual ~data_chunk_source() = default;
+
+  /**
+   * @brief Create a new reader over this data source.
+   *
+   * @return an owning pointer to the newly created reader
+   */
+  virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
+};
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf