Skip to content

Commit

Permalink
byte_range support for multibyte_split/read_text (#10150)
Browse files Browse the repository at this point in the history
Adding byte_range support to multibyte_split/read_text.

Closes #9655

Providing a byte range in terms of `(offset, size)` allows multibyte_split to read a whole file, but only return the offsets within that range as well as one additional offset (unless it's the end of the file). If thinking in terms of "records", where each delimiter dictates the end of a record, we effectively return all records which _begin_ within the byte range provided, including any record which begins in the range but _ends_ outside of it. All other records are ignored, including any record which ends (but does not begin) within the range.

examples:
```
input: "abc..def..ghi..jkl.."
delimiter: ..
```
```
range offset: 0
range size: 2
output: ["abc.."]
```
```
range offset: 2
range size: 9
output: ["def..", "ghi.."]
```
```
range offset: 11
range size: 2
output: []
```
```
range offset: 13
range size: 7
output: ["jkl..", ""]
```

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10150
  • Loading branch information
cwharris authored Mar 1, 2022
1 parent 5d8ea19 commit 78b316c
Show file tree
Hide file tree
Showing 14 changed files with 401 additions and 73 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ test:
- test -f $PREFIX/include/cudf/io/orc_metadata.hpp
- test -f $PREFIX/include/cudf/io/orc.hpp
- test -f $PREFIX/include/cudf/io/parquet.hpp
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ add_library(
src/io/parquet/writer_impl.cu
src/io/statistics/orc_column_statistics.cu
src/io/statistics/parquet_column_statistics.cu
src/io/text/byte_range_info.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
Expand Down
74 changes: 74 additions & 0 deletions cpp/include/cudf/io/text/byte_range_info.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/utilities/error.hpp>

#include <cstdint>
#include <vector>

namespace cudf {
namespace io {
namespace text {

/**
 * @brief Stores the offset and size used to indicate a byte range.
 *
 * A default-constructed instance has an offset of 0 and a size of 0.
 */
class byte_range_info {
 private:
  int64_t _offset;
  int64_t _size;

 public:
  /**
   * @brief Constructs a range with offset 0 and size 0.
   */
  constexpr byte_range_info() noexcept : _offset(0), _size(0) {}

  /**
   * @brief Constructs a range from an offset and a size.
   *
   * Both arguments are checked with `CUDF_EXPECTS` to be non-negative.
   *
   * @param offset offset of the first byte in the range
   * @param size number of bytes in the range
   */
  constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
  {
    CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
    CUDF_EXPECTS(size >= 0, "size must be non-negative");
  }

  constexpr byte_range_info(byte_range_info const& other) noexcept            = default;
  constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;

  // The accessors are const-qualified so they can be called through a const object or a
  // const reference; constexpr alone does not imply const since C++14.

  /**
   * @brief Returns the offset of the range.
   */
  [[nodiscard]] constexpr int64_t offset() const { return _offset; }

  /**
   * @brief Returns the size of the range.
   */
  [[nodiscard]] constexpr int64_t size() const { return _size; }
};

/**
* @brief Create a collection of consecutive ranges between [0, total_bytes).
*
* Each range will be the same size except if `total_bytes` is not evenly divisible by
* `range_count`, in which case the last range size will be the remainder. The common range size
* is `total_bytes / range_count`, rounded up.
*
* @param total_bytes total number of bytes in all ranges
* @param range_count total number of ranges in which to divide bytes
* @return Vector of range objects
*/
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
int64_t range_count);

/**
* @brief Create a byte_range_info which represents as much of a file as possible. Specifically,
* `[0, std::numeric_limits<int64_t>::max())`.
*
* @return A range with offset 0 and size `std::numeric_limits<int64_t>::max()`
*/
byte_range_info create_byte_range_info_max();

} // namespace text
} // namespace io
} // namespace cudf
7 changes: 6 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -36,6 +36,7 @@ namespace text {
*/
class device_data_chunk {
public:
virtual ~device_data_chunk() = default;
[[nodiscard]] virtual char const* data() const = 0;
[[nodiscard]] virtual std::size_t size() const = 0;
virtual operator device_span<char const>() const = 0;
Expand All @@ -52,6 +53,9 @@ class device_data_chunk {
*/
class data_chunk_reader {
public:
virtual ~data_chunk_reader() = default;
virtual void skip_bytes(std::size_t size) = 0;

/**
* @brief Get the next chunk of bytes from the data source
*
Expand All @@ -76,6 +80,7 @@ class data_chunk_reader {
*/
class data_chunk_source {
public:
// Virtual destructor, so derived sources are destroyed correctly through a base pointer.
virtual ~data_chunk_source() = default;
// Creates a reader for this source; the caller owns the returned reader.
[[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
};

Expand Down
10 changes: 9 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source_factories.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,6 +89,8 @@ class istream_data_chunk_reader : public data_chunk_reader {
}
}

void skip_bytes(std::size_t size) override
{
  // Advance the underlying stream by asking it to ignore `size` characters.
  _datastream->ignore(size);
}

std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
{
Expand Down Expand Up @@ -143,6 +145,12 @@ class device_span_data_chunk_reader : public data_chunk_reader {
public:
// Constructs a reader over `data`. Only the span is stored in `_data`; nothing is copied here.
device_span_data_chunk_reader(device_span<char const> data) : _data(data) {}

void skip_bytes(std::size_t read_size) override
{
  // Clamp the request to the bytes still unread, so the position never passes the end of the span.
  auto const remaining = _data.size() - _position;
  if (remaining < read_size) { read_size = remaining; }

  _position += read_size;
}

std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
{
Expand Down
16 changes: 1 addition & 15 deletions cpp/include/cudf/io/text/detail/trie.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,20 +89,6 @@ struct trie_device_view {
*/
constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; }

/**
 * @brief returns the longest matching state of any state in the multistate.
 *
 * Visits every tail state held by `states`, looks up its match length, and keeps the maximum.
 * Returns 0 when `states` holds no states.
 *
 * @param states the multistate whose tail states are inspected
 * @return the largest match length among the tail states
 */
template <uint32_t N>
constexpr uint8_t get_match_length(multistate const& states)
{
  // Unsigned accumulator matching the return type. A signed 8-bit accumulator would narrow any
  // match length above 127 to a negative value and break the running-maximum comparison.
  uint8_t longest = 0;
  for (uint8_t i = 0; i < states.size(); i++) {
    auto const match_length = get_match_length(states.get_tail(i));
    if (match_length > longest) { longest = match_length; }
  }
  return longest;
}

private:
constexpr void transition_enqueue_all( //
char c,
Expand Down
48 changes: 46 additions & 2 deletions cpp/include/cudf/io/text/multibyte_split.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/io/text/data_chunk_source.hpp>

#include <rmm/mr/device/device_memory_resource.hpp>
Expand All @@ -27,10 +28,53 @@ namespace cudf {
namespace io {
namespace text {

/**
* @brief Splits the source text into a strings column using a multiple byte delimiter.
*
* Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
* of delimiters which begin within the range. If thinking in terms of "records", where each
* delimiter dictates the end of a record, all records which begin within the byte range provided
* will be returned, including any record which may begin in the range but end outside of the
* range. Records which begin outside of the range will be ignored, even if those records end
* inside the range.
*
* @code{.pseudo}
* Examples:
* source: "abc..def..ghi..jkl.."
* delimiter: ".."
*
* byte_range: nullopt
* return: ["abc..", "def..", "ghi..", "jkl..", ""]
*
* byte_range: [0, 2)
* return: ["abc.."]
*
* byte_range: [2, 9)
* return: ["def..", "ghi.."]
*
* byte_range: [11, 2)
* return: []
*
* byte_range: [13, 7)
* return: ["jkl..", ""]
* @endcode
*
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range range in which to consider offsets relevant
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
*/
std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
std::optional<byte_range_info> byte_range = std::nullopt,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Overload of multibyte_split which does not take a byte range.
*
* NOTE(review): the definition is not visible from this header; presumably this processes the
* whole source, as if `byte_range` were `std::nullopt` - confirm against the implementation.
*
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter.
*/
std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
std::string const& delimiter,
rmm::mr::device_memory_resource* mr);

} // namespace text
} // namespace io
Expand Down
47 changes: 47 additions & 0 deletions cpp/src/io/text/byte_range_info.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/text/byte_range_info.hpp>

#include <limits>

namespace cudf {
namespace io {
namespace text {

// Builds the largest representable range: offset 0 with a size of the int64_t maximum.
byte_range_info create_byte_range_info_max()
{
  return byte_range_info{0, std::numeric_limits<int64_t>::max()};
}

/**
 * Splits [0, total_bytes) into `range_count` consecutive byte ranges.
 *
 * Every range has size `total_bytes / range_count` rounded up, except the trailing ranges, which
 * are shortened (possibly to zero) so that no range extends past `total_bytes`.
 *
 * Returns an empty vector when `range_count` is not positive.
 */
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
                                                                 int64_t range_count)
{
  auto ranges = std::vector<byte_range_info>();

  // Nothing to produce. This also keeps a zero `range_count` away from the division below.
  if (range_count <= 0) { return ranges; }

  auto const range_size = util::div_rounding_up_safe(total_bytes, range_count);

  // Reserve one slot per range, not one per byte in a range.
  ranges.reserve(range_count);

  for (int64_t i = 0; i < range_count; i++) {
    // Rounding the range size up can push trailing offsets past the end (e.g. 5 bytes split into
    // 4 ranges). Clamping the offset makes those ranges empty instead of negative-sized.
    auto const offset = std::min(i * range_size, total_bytes);
    auto const size   = std::min(range_size, total_bytes - offset);
    ranges.emplace_back(offset, size);
  }

  return ranges;
}

} // namespace text
} // namespace io
} // namespace cudf
Loading

0 comments on commit 78b316c

Please sign in to comment.