Skip to content

Commit

Permalink
Merge branch 'branch-22.04' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…bug-data_gen-limits
  • Loading branch information
vuule committed Mar 2, 2022
2 parents 4ee7037 + 78b316c commit 7aa752d
Show file tree
Hide file tree
Showing 15 changed files with 431 additions and 74 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ test:
- test -f $PREFIX/include/cudf/io/orc_metadata.hpp
- test -f $PREFIX/include/cudf/io/orc.hpp
- test -f $PREFIX/include/cudf/io/parquet.hpp
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ add_library(
src/io/parquet/writer_impl.cu
src/io/statistics/orc_column_statistics.cu
src/io/statistics/parquet_column_statistics.cu
src/io/text/byte_range_info.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
Expand Down
74 changes: 74 additions & 0 deletions cpp/include/cudf/io/text/byte_range_info.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/utilities/error.hpp>

#include <cstdint>
#include <vector>

namespace cudf {
namespace io {
namespace text {

/**
 * @brief Stores the offset and size (both in bytes) used to indicate a byte range.
 *
 * A default-constructed object describes the empty range starting at offset zero. The
 * two-argument constructor checks, via `CUDF_EXPECTS`, that neither value is negative.
 */
class byte_range_info {
 private:
  int64_t _offset;  ///< Offset of the first byte in the range
  int64_t _size;    ///< Number of bytes in the range

 public:
  /**
   * @brief Constructs an empty range: offset 0, size 0.
   */
  constexpr byte_range_info() noexcept : _offset(0), _size(0) {}

  /**
   * @brief Constructs a range from an offset and a size.
   *
   * Fails the corresponding `CUDF_EXPECTS` check if `offset` or `size` is negative.
   *
   * @param offset Offset of the first byte in the range; must be non-negative
   * @param size Number of bytes in the range; must be non-negative
   */
  constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
  {
    CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
    CUDF_EXPECTS(size >= 0, "size must be non-negative");
  }

  constexpr byte_range_info(byte_range_info const& other) noexcept            = default;
  constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;

  // The accessors are const-qualified: a constexpr member function is not implicitly const,
  // and without the qualifier they could not be called on a const object or const reference.

  /**
   * @brief Returns the offset of the range.
   *
   * @return Offset in bytes
   */
  [[nodiscard]] constexpr int64_t offset() const noexcept { return _offset; }

  /**
   * @brief Returns the size of the range.
   *
   * @return Size in bytes
   */
  [[nodiscard]] constexpr int64_t size() const noexcept { return _size; }
};

/**
 * @brief Create a collection of consecutive ranges between [0, total_bytes).
 *
 * Ranges are laid out back to back in ascending offset order, starting at offset zero.
 * Each range will be the same size except if `total_bytes` is not evenly divisible by
 * `range_count`, in which case the range size is rounded up and the last range holds
 * whatever bytes remain.
 *
 * @param total_bytes total number of bytes in all ranges
 * @param range_count total number of ranges in which to divide bytes
 * @return Vector of range objects
 */
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
int64_t range_count);

/**
 * @brief Create a byte_range_info which represents as much of a file as possible. Specifically,
 * a range with offset `0` and size `std::numeric_limits<int64_t>::max()`.
 *
 * @return `[0, std::numeric_limits<int64_t>::max())`
 */
byte_range_info create_byte_range_info_max();

} // namespace text
} // namespace io
} // namespace cudf
7 changes: 6 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -36,6 +36,7 @@ namespace text {
*/
class device_data_chunk {
public:
virtual ~device_data_chunk() = default;
[[nodiscard]] virtual char const* data() const = 0;
[[nodiscard]] virtual std::size_t size() const = 0;
virtual operator device_span<char const>() const = 0;
Expand All @@ -52,6 +53,9 @@ class device_data_chunk {
*/
class data_chunk_reader {
public:
virtual ~data_chunk_reader() = default;
virtual void skip_bytes(std::size_t size) = 0;

/**
* @brief Get the next chunk of bytes from the data source
*
Expand All @@ -76,6 +80,7 @@ class data_chunk_reader {
*/
class data_chunk_source {
public:
virtual ~data_chunk_source() = default;
[[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
};

Expand Down
10 changes: 9 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source_factories.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,6 +89,8 @@ class istream_data_chunk_reader : public data_chunk_reader {
}
}

/**
 * @brief Skips ahead in the data stream by forwarding `size` to `_datastream->ignore`.
 *
 * @param size Number of bytes to skip
 */
void skip_bytes(std::size_t size) override
{
  _datastream->ignore(size);
}

std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
{
Expand Down Expand Up @@ -143,6 +145,12 @@ class device_span_data_chunk_reader : public data_chunk_reader {
public:
device_span_data_chunk_reader(device_span<char const> data) : _data(data) {}

/**
 * @brief Advances the read position by `read_size` bytes, stopping at the end of the span.
 *
 * @param read_size Number of bytes to skip; limited to the bytes left after `_position`
 */
void skip_bytes(std::size_t read_size) override
{
  // Never move the position past the end of `_data`.
  std::size_t const bytes_left = _data.size() - _position;
  std::size_t const advance_by = (read_size > bytes_left) ? bytes_left : read_size;
  _position += advance_by;
}

std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
{
Expand Down
16 changes: 1 addition & 15 deletions cpp/include/cudf/io/text/detail/trie.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,20 +89,6 @@ struct trie_device_view {
*/
constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; }

/**
* @brief returns the longest matching state of any state in the multistate.
*/
template <uint32_t N>
constexpr uint8_t get_match_length(multistate const& states)
{
int8_t val = 0;
for (uint8_t i = 0; i < states.size(); i++) {
auto match_length = get_match_length(states.get_tail(i));
if (match_length > val) { val = match_length; }
}
return val;
}

private:
constexpr void transition_enqueue_all( //
char c,
Expand Down
48 changes: 46 additions & 2 deletions cpp/include/cudf/io/text/multibyte_split.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/io/text/data_chunk_source.hpp>

#include <rmm/mr/device/device_memory_resource.hpp>
Expand All @@ -27,10 +28,53 @@ namespace cudf {
namespace io {
namespace text {

/**
* @brief Splits the source text into a strings column using a multiple byte delimiter.
*
* Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
* of delimiters which begin within the range. If thinking in terms of "records", where each
* delimiter dictates the end of a record, all records which begin within the byte range provided
* will be returned, including any record which may begin in the range but end outside of the
* range. Records which begin outside of the range will be ignored, even if those records end inside
* the range.
*
* @code{.pseudo}
* Examples:
* source: "abc..def..ghi..jkl.."
* delimiter: ".."
*
* byte_range: nullopt
* return: ["abc..", "def..", "ghi..", "jkl..", ""]
*
* byte_range: [0, 2)
* return: ["abc.."]
*
* byte_range: [2, 9)
* return: ["def..", "ghi.."]
*
* byte_range: [11, 2)
* return: []
*
* byte_range: [13, 7)
* return: ["jkl..", ""]
* @endcode
*
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range range in which to consider offsets relevant
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
*/
std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
std::optional<byte_range_info> byte_range = std::nullopt,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
std::string const& delimiter,
rmm::mr::device_memory_resource* mr);

} // namespace text
} // namespace io
Expand Down
47 changes: 47 additions & 0 deletions cpp/src/io/text/byte_range_info.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/text/byte_range_info.hpp>

#include <limits>

namespace cudf {
namespace io {
namespace text {

// Builds the widest representable range: offset zero, size equal to the largest int64_t value.
byte_range_info create_byte_range_info_max()
{
  constexpr auto largest_size = std::numeric_limits<int64_t>::max();
  return byte_range_info{0, largest_size};
}

std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
int64_t range_count)
{
auto range_size = util::div_rounding_up_safe(total_bytes, range_count);
auto ranges = std::vector<byte_range_info>();

ranges.reserve(range_size);

for (int64_t i = 0; i < range_count; i++) {
auto offset = i * range_size;
auto size = std::min(range_size, total_bytes - offset);
ranges.emplace_back(offset, size);
}

return ranges;
}

} // namespace text
} // namespace io
} // namespace cudf
Loading

0 comments on commit 7aa752d

Please sign in to comment.