-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
byte_range support for multibyte_split/read_text (#10150)
Adding byte_range support to multibyte_split/read_text. Closes #9655 providing a byte range in terms of `(offset, size)` allows multibyte_split to read a whole file, but only return the offsets within those ranges as well as one additional offset (unless it's the end of the file). If thinking in terms of "records", where each delimiter dictates the end of a record, we effectively return all records which _begin_ within the byte range provided, and ignore all other records, including any record which may end (but not begin) within the range, and including any record which may begin in the range but _end_ outside of the range. examples: ``` input: "abc..def..ghi..jkl.." delimiter: .. ``` ``` range offset: 0 range size: 2 output: ["abc.."] ``` ``` range offset: 2 range size: 9 output: ["def..", "ghi.."] ``` ``` range offset: 11 range size: 2 output: [] ``` ``` range offset: 13 range size: 7 output: ["jkl..", ""] ``` Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) URL: #10150
- Loading branch information
Showing
14 changed files
with
401 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/utilities/error.hpp> | ||
|
||
#include <cstdint> | ||
#include <vector> | ||
|
||
namespace cudf { | ||
namespace io { | ||
namespace text { | ||
|
||
/** | ||
* @brief stores offset and size used to indicate a byte range | ||
*/ | ||
class byte_range_info { | ||
private: | ||
int64_t _offset; | ||
int64_t _size; | ||
|
||
public: | ||
constexpr byte_range_info() noexcept : _offset(0), _size(0) {} | ||
constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) | ||
{ | ||
CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); | ||
CUDF_EXPECTS(size >= 0, "size must be non-negative"); | ||
} | ||
|
||
constexpr byte_range_info(byte_range_info const& other) noexcept = default; | ||
constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; | ||
|
||
[[nodiscard]] constexpr int64_t offset() { return _offset; } | ||
[[nodiscard]] constexpr int64_t size() { return _size; } | ||
}; | ||
|
||
/** | ||
* @brief Create a collection of consecutive ranges between [0, total_bytes). | ||
* | ||
* Each range will be the same size except if `total_bytes` is not evenly divisible by | ||
* `range_count`, in which case the last range size will be the remainder. | ||
* | ||
* @param total_bytes total number of bytes in all ranges | ||
* @param range_count total number of ranges in which to divide bytes | ||
* @return Vector of range objects | ||
*/ | ||
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes, | ||
int64_t range_count); | ||
|
||
/** | ||
* @brief Create a byte_range_info which represents as much of a file as possible. Specifically, | ||
* `[0, std::numeric_limits<int64_t>::max())`. | ||
* | ||
* @return `[0, std::numeric_limits<int64_t>::max())` | ||
*/ | ||
byte_range_info create_byte_range_info_max(); | ||
|
||
} // namespace text | ||
} // namespace io | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf/detail/utilities/integer_utils.hpp> | ||
#include <cudf/io/text/byte_range_info.hpp> | ||
|
||
#include <limits> | ||
|
||
namespace cudf { | ||
namespace io { | ||
namespace text { | ||
|
||
/**
 * @brief Builds the largest byte range this type can describe.
 *
 * The range starts at offset 0 and its size is the maximum value of `int64_t`.
 *
 * @return A `byte_range_info` with offset 0 and size `std::numeric_limits<int64_t>::max()`
 */
byte_range_info create_byte_range_info_max()
{
  constexpr int64_t largest_size = std::numeric_limits<int64_t>::max();
  return byte_range_info(0, largest_size);
}
|
||
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes, | ||
int64_t range_count) | ||
{ | ||
auto range_size = util::div_rounding_up_safe(total_bytes, range_count); | ||
auto ranges = std::vector<byte_range_info>(); | ||
|
||
ranges.reserve(range_size); | ||
|
||
for (int64_t i = 0; i < range_count; i++) { | ||
auto offset = i * range_size; | ||
auto size = std::min(range_size, total_bytes - offset); | ||
ranges.emplace_back(offset, size); | ||
} | ||
|
||
return ranges; | ||
} | ||
|
||
} // namespace text | ||
} // namespace io | ||
} // namespace cudf |
Oops, something went wrong.