byte_range support for multibyte_split/read_text (#10150)

Adds byte_range support to multibyte_split/read_text. Closes #9655. Providing a byte range in terms of `(offset, size)` allows multibyte_split to read a whole file, but only return the offsets within that range as well as one additional offset (unless it's the end of the file). If thinking in terms of "records", where each delimiter dictates the end of a record, we effectively return all records which _begin_ within the byte range provided, including any record which may begin in the range but _end_ outside of the range, and ignore all other records, including any record which may end (but not begin) within the range. Examples: ``` input: "abc..def..ghi..jkl.." delimiter: .. ``` ``` range offset: 0 range size: 2 output: ["abc.."] ``` ``` range offset: 2 range size: 9 output: ["def..", "ghi.."] ``` ``` range offset: 11 range size: 2 output: [] ``` ``` range offset: 13 range size: 7 output: ["jkl..", ""] ``` Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) URL: #10150
rapidsai · Mar 1, 2022 · 78b316c · 78b316c
1 parent 5d8ea19
commit 78b316c
Show file tree

Hide file tree

Showing 14 changed files with 401 additions and 73 deletions.
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -137,6 +137,7 @@ test:
     - test -f $PREFIX/include/cudf/io/orc_metadata.hpp
     - test -f $PREFIX/include/cudf/io/orc.hpp
     - test -f $PREFIX/include/cudf/io/parquet.hpp
+    - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
     - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
     - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
     - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -311,6 +311,7 @@ add_library(
   src/io/parquet/writer_impl.cu
   src/io/statistics/orc_column_statistics.cu
   src/io/statistics/parquet_column_statistics.cu
+  src/io/text/byte_range_info.cpp
   src/io/text/multibyte_split.cu
   src/io/utilities/column_buffer.cpp
   src/io/utilities/config_utils.cpp

diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/error.hpp>
+
+#include <cstdint>
+#include <vector>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+/**
+ * @brief Stores the offset and size, in bytes, used to indicate a byte range.
+ *
+ * The two-argument constructor checks that both values are non-negative.
+ */
+class byte_range_info {
+ private:
+  int64_t _offset;  ///< offset of the range, in bytes
+  int64_t _size;    ///< size of the range, in bytes
+
+ public:
+  /// Constructs an empty range with offset 0 and size 0.
+  constexpr byte_range_info() noexcept : _offset(0), _size(0) {}
+
+  /**
+   * @brief Constructs a range from an offset and a size.
+   *
+   * Both arguments are checked with CUDF_EXPECTS and must be non-negative.
+   * NOTE(review): CUDF_EXPECTS presumably throws cudf::logic_error on failure — confirm in
+   * cudf/utilities/error.hpp.
+   *
+   * @param offset offset of the range, in bytes
+   * @param size size of the range, in bytes
+   */
+  constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
+  {
+    CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
+    CUDF_EXPECTS(size >= 0, "size must be non-negative");
+  }
+
+  constexpr byte_range_info(byte_range_info const& other) noexcept = default;
+  constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;
+
+  // Accessors are const so they can be called on const objects and through const references.
+
+  /// @return the offset of the range, in bytes
+  [[nodiscard]] constexpr int64_t offset() const noexcept { return _offset; }
+  /// @return the size of the range, in bytes
+  [[nodiscard]] constexpr int64_t size() const noexcept { return _size; }
+};
+
+/**
+ * @brief Create a collection of consecutive ranges between [0, total_bytes).
+ *
+ * Each range will be the same size except if `total_bytes` is not evenly divisible by
+ * `range_count`, in which case the last range size will be the remainder.
+ *
+ * @param total_bytes total number of bytes in all ranges
+ * @param range_count total number of ranges in which to divide bytes
+ * @return Vector of range objects
+ */
+std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
+                                                                 int64_t range_count);
+
+/**
+ * @brief Create a byte_range_info which represents as much of a file as possible. Specifically,
+ * `[0, std::numeric_limits<int64_t>::max())`.
+ *
+ * @return `[0, std::numeric_limits<int64_t>::max())`
+ */
+byte_range_info create_byte_range_info_max();
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@ namespace text {
  */
 class device_data_chunk {
  public:
+  virtual ~device_data_chunk()                     = default;
   [[nodiscard]] virtual char const* data() const   = 0;
   [[nodiscard]] virtual std::size_t size() const   = 0;
   virtual operator device_span<char const>() const = 0;
@@ -52,6 +53,9 @@ class device_data_chunk {
  */
 class data_chunk_reader {
  public:
+  virtual ~data_chunk_reader()              = default;
+  virtual void skip_bytes(std::size_t size) = 0;
+
   /**
    * @brief Get the next chunk of bytes from the data source
    *
@@ -76,6 +80,7 @@ class data_chunk_reader {
  */
 class data_chunk_source {
  public:
+  virtual ~data_chunk_source()                                                   = default;
   [[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
 };
 

diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -89,6 +89,8 @@ class istream_data_chunk_reader : public data_chunk_reader {
     }
   }
 
+  /// Skips `size` bytes of input by forwarding to `_datastream->ignore(size)`.
+  void skip_bytes(std::size_t size) override { _datastream->ignore(size); }
+
   std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
                                                     rmm::cuda_stream_view stream) override
   {
@@ -143,6 +145,12 @@ class device_span_data_chunk_reader : public data_chunk_reader {
  public:
   device_span_data_chunk_reader(device_span<char const> data) : _data(data) {}
 
+  /**
+   * @brief Advances the read position by up to `read_size` bytes.
+   *
+   * The requested amount is clamped to `_data.size() - _position`, i.e. the bytes left in the
+   * span from the current position.
+   *
+   * @param read_size number of bytes requested to skip
+   */
+  void skip_bytes(std::size_t read_size) override
+  {
+    auto const remaining = _data.size() - _position;
+    if (read_size > remaining) { read_size = remaining; }
+    _position += read_size;
+  }
+
   std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
                                                     rmm::cuda_stream_view stream) override
   {

diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -89,20 +89,6 @@ struct trie_device_view {
    */
   constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; }
 
-  /**
-   * @brief returns the longest matching state of any state in the multistate.
-   */
-  template <uint32_t N>
-  constexpr uint8_t get_match_length(multistate const& states)
-  {
-    int8_t val = 0;
-    for (uint8_t i = 0; i < states.size(); i++) {
-      auto match_length = get_match_length(states.get_tail(i));
-      if (match_length > val) { val = match_length; }
-    }
-    return val;
-  }
-
  private:
   constexpr void transition_enqueue_all(  //
     char c,

diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
@@ -27,10 +28,53 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Splits the source text into a strings column using a multiple byte delimiter.
+ *
+ * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
+ * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * delimiter dictates the end of a record, all records which begin within the byte range provided
+ * will be returned, including any record which may begin in the range but end outside of the
+ * range. Records which begin outside of the range will be ignored, even if those records end inside
+ * the range.
+ *
+ * @code{.pseudo}
+ * Examples:
+ *  source:     "abc..def..ghi..jkl.."
+ *  delimiter:  ".."
+ *
+ *  byte_range: nullopt
+ *  return:     ["abc..", "def..", "ghi..", "jkl..", ""]
+ *
+ *  byte_range: [0, 2)
+ *  return:     ["abc.."]
+ *
+ *  byte_range: [2, 9)
+ *  return:     ["def..", "ghi.."]
+ *
+ *  byte_range: [11, 2)
+ *  return:     []
+ *
+ *  byte_range: [13, 7)
+ *  return:     ["jkl..", ""]
+ * @endcode
+ *
+ * @param source The source string
+ * @param delimiter UTF-8 encoded string for which to find offsets in the source
+ * @param byte_range range in which to consider offsets relevant
+ * @param mr Memory resource to use for the device memory allocation
+ * @return The strings found by splitting the source by the delimiter within the relevant byte
+ * range.
+ */
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::optional<byte_range_info> byte_range = std::nullopt,
+  rmm::mr::device_memory_resource* mr       = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits the source text into a strings column using a multiple byte delimiter.
+ *
+ * Overload that takes no byte range argument; `mr` has no default value here and must be passed
+ * explicitly.
+ * NOTE(review): presumably equivalent to calling the byte-range overload with `std::nullopt` —
+ * confirm against the definition.
+ *
+ * @param source The source string
+ * @param delimiter UTF-8 encoded string for which to find offsets in the source
+ * @param mr Memory resource to use for the device memory allocation
+ * @return The strings found by splitting the source by the delimiter.
+ */
+std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              rmm::mr::device_memory_resource* mr);
 
 }  // namespace text
 }  // namespace io

diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/io/text/byte_range_info.hpp>
+
+#include <algorithm>
+#include <limits>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+// Builds the widest range this type can express: offset 0 with a size of int64_t's maximum.
+byte_range_info create_byte_range_info_max()
+{
+  constexpr int64_t largest_size = std::numeric_limits<int64_t>::max();
+  return byte_range_info{0, largest_size};
+}
+
+// Splits [0, total_bytes) into `range_count` consecutive ranges. Every range has the rounded-up
+// size `ceil(total_bytes / range_count)` except the trailing ones, which are shortened (possibly
+// to zero bytes) so that no range extends past `total_bytes`.
+std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
+                                                                 int64_t range_count)
+{
+  // Validate before dividing: range_count is passed as the divisor below.
+  CUDF_EXPECTS(total_bytes >= 0, "total_bytes must be non-negative");
+  CUDF_EXPECTS(range_count > 0, "range_count must be positive");
+
+  auto const range_size = util::div_rounding_up_safe(total_bytes, range_count);
+  auto ranges           = std::vector<byte_range_info>();
+
+  // One element is emitted per range, so reserve by the number of ranges, not their byte size.
+  ranges.reserve(range_count);
+
+  for (int64_t i = 0; i < range_count; i++) {
+    // Rounding range_size up can exhaust total_bytes before every range is emitted (e.g. 10 bytes
+    // in 7 ranges). Clamp the offset so the surplus ranges are empty rather than negatively sized.
+    auto const offset = std::min(i * range_size, total_bytes);
+    auto const size   = std::min(range_size, total_bytes - offset);
+    ranges.emplace_back(offset, size);
+  }
+
+  return ranges;
+}
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf