From 5e9cf26a6a2ea9053a57befa11f9ea866ebb9e8f Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 26 Jun 2023 15:54:54 -0700 Subject: [PATCH 01/36] add DELTA_BINARY_PACKED decoder --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/delta_binary.cuh | 272 +++++++++++++++++++ cpp/src/io/parquet/page_data.cu | 43 +-- cpp/src/io/parquet/page_decode.cuh | 4 + cpp/src/io/parquet/page_delta_decode.cu | 165 +++++++++++ cpp/src/io/parquet/page_string_decode.cu | 88 +----- cpp/src/io/parquet/page_string_utils.cuh | 110 ++++++++ cpp/src/io/parquet/parquet_gpu.hpp | 19 ++ cpp/src/io/parquet/reader_impl.cpp | 32 ++- cpp/src/io/parquet/reader_impl_preprocess.cu | 7 +- 10 files changed, 626 insertions(+), 115 deletions(-) create mode 100644 cpp/src/io/parquet/delta_binary.cuh create mode 100644 cpp/src/io/parquet/page_delta_decode.cu create mode 100644 cpp/src/io/parquet/page_string_utils.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0742d039092..f8e500ac906 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,6 +392,7 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_delta_decode.cu src/io/parquet/page_string_decode.cu src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh new file mode 100644 index 00000000000..b02164a377f --- /dev/null +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "page_decode.cuh" + +namespace cudf::io::parquet::gpu { + +// DELTA_XXX encoding support +// +// DELTA_BINARY_PACKED is used for INT32 and INT64 data types. Encoding begins with a header +// containing a block size, number of mini-blocks in each block, total value count, and first +// value. The first three are ULEB128 variable length ints, and the last is a zigzag ULEB128 +// varint. +// -- the block size is a multiple of 128 +// -- the mini-block count is chosen so that each mini-block will contain a multiple of 32 values +// -- the value count includes the first value stored in the header +// +// It seems most Parquet encoders will stick with a block size of 128, and 4 mini-blocks of 32 +// elements each. arrow-rs will use a block size of 256 for 64-bit ints. +// +// Following the header are the data blocks. Each block is further divided into mini-blocks, with +// each mini-block having its own encoding bitwidth. Each block begins with a header containing a +// zigzag ULEB128 encoded minimum delta value, followed by an array of uint8 bitwidths, one entry +// per mini-block. While encoding, the lowest delta value is subtracted from all the deltas in the +// block to ensure that all encoded values are positive. The deltas for each mini-block are bit +// packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. 
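To make the description above concrete, the per-mini-block reconstruction a decoder performs is: unpack the bit-packed (non-negative) stored deltas, add the block's minimum delta back to each, and take a running sum seeded with the header's first value (or the last value produced so far). A minimal host-side sketch of that step, with the bit-unpacking omitted and all names purely illustrative (not taken from this patch):

    #include <cstdint>
    #include <vector>

    // Illustration only: reconstruct values from one mini-block whose stored
    // deltas have already been unpacked into `stored_deltas`.
    std::vector<int64_t> decode_mini_block(int64_t last_value,  // header first value, or last value so far
                                           int64_t min_delta,   // zigzag varint from the block header
                                           std::vector<uint64_t> const& stored_deltas)
    {
      std::vector<int64_t> out;
      for (auto d : stored_deltas) {
        last_value += min_delta + static_cast<int64_t>(d);  // true delta = stored delta + min delta
        out.push_back(last_value);
      }
      return out;
    }

For example, the sequence 5, 7, 6, 10 has deltas 2, -1, 4 and minimum delta -1, so the stored deltas are 3, 0, 5; starting from the first value 5, the loop above yields 7, 6, 10.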
+// +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. + +// TODO: The delta encodings use ULEB128 integers, but for now we're only +// using max 64 bits. Need to see what the performance impact is of useing +// __int128_t rather than int64_t. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +/** + * @brief Read a ULEB128 varint integer + * + * @param[in,out] cur The current data position, updated after the read + * @param[in] end The end data position + * + * @return The value read + */ +inline __device__ uleb128_t get_uleb128(uint8_t const*& cur, uint8_t const* end) +{ + uleb128_t v = 0, l = 0, c; + while (cur < end) { + c = *cur++; + v |= (c & 0x7f) << l; + l += 7; + if ((c & 0x80) == 0) { return v; } + } + return v; +} + +/** + * @brief Read a ULEB128 zig-zag encoded varint integer + * + * @param[in,out] cur The current data position, updated after the read + * @param[in] end The end data position + * + * @return The value read + */ +inline __device__ zigzag128_t get_zz128(uint8_t const*& cur, uint8_t const* end) +{ + uleb128_t u = get_uleb128(cur, end); + return static_cast((u >> 1u) ^ -static_cast(u & 1)); +} + +struct delta_binary_decoder { + uint8_t const* block_start; // start of data, but updated as data is read + uint8_t const* block_end; // end of data + uleb128_t block_size; // usually 128, must be multiple of 128 + uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a + // multiple of 32 + uleb128_t value_count; // total values encoded in the block + zigzag128_t last_value; // last value decoded, initialized to first_value from header + + uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 + uint32_t current_value_idx; // current value index, initialized to 0 at start of block + + zigzag128_t cur_min_delta; // min delta for the block + uint32_t cur_mb; // index of the current mini-block within the block + uint8_t const* cur_mb_start; // pointer to the start of the current mini-block data + uint8_t const* cur_bitwidths; // pointer to the bitwidth array in the block + + uleb128_t value[non_zero_buffer_size]; // circular buffer of delta values + + // returns the number of values encoded in the block data. when is_decode is true, + // account for the first value in the header. otherwise just count the values encoded + // in the mini-block data. + constexpr uint32_t num_encoded_values(bool is_decode) + { + return value_count == 0 ? 0 : is_decode ? value_count : value_count - 1; + } + + // read mini-block header into state object. should only be called from init_binary_block or + // setup_next_mini_block. header format is: + // + // | min delta (int) | bit-width array (1 byte * mini_block_count) | + // + // on exit db->cur_mb is 0 and db->cur_mb_start points to the first mini-block of data, or + // nullptr if out of data. 
+ inline __device__ void init_mini_block(bool is_decode) + { + cur_mb = 0; + cur_mb_start = nullptr; + + if (current_value_idx < num_encoded_values(is_decode)) { + auto d_start = block_start; + cur_min_delta = get_zz128(d_start, block_end); + cur_bitwidths = d_start; + + d_start += mini_block_count; + cur_mb_start = d_start; + } + } + + // read delta binary header into state object. should be called on thread 0. header format is: + // + // | block size (uint) | mini-block count (uint) | value count (uint) | first value (int) | + // + // also initializes the first mini-block before exit + inline __device__ void init_binary_block(uint8_t const* d_start, uint8_t const* d_end) + { + block_end = d_end; + block_size = get_uleb128(d_start, d_end); + mini_block_count = get_uleb128(d_start, d_end); + value_count = get_uleb128(d_start, d_end); + last_value = get_zz128(d_start, d_end); + + current_value_idx = 0; + values_per_mb = block_size / mini_block_count; + + // init the first mini-block + block_start = d_start; + init_mini_block(false); + } + + // skip to the start of the next mini-block. should only be called on thread 0. + // calls init_binary_block if currently on the last mini-block in a block. + inline __device__ void setup_next_mini_block(bool is_decode) + { + if (current_value_idx >= num_encoded_values(is_decode)) { return; } + + current_value_idx += values_per_mb; + + // just set pointer to start of next mini_block + if (cur_mb < mini_block_count - 1) { + cur_mb_start += cur_bitwidths[cur_mb] * values_per_mb / 8; + cur_mb++; + } + // out of mini-blocks, start a new block + else { + block_start = cur_mb_start + cur_bitwidths[cur_mb] * values_per_mb / 8; + init_mini_block(is_decode); + } + } + + // decode the current mini-batch of deltas, and convert to values. + // called by all threads in a warp, currently only one warp supported. + inline __device__ void calc_mini_block_values(int lane_id) + { + using cudf::detail::warp_size; + if (current_value_idx >= value_count) { return; } + + // need to save first value from header on first pass + if (current_value_idx == 0) { + if (lane_id == 0) { + current_value_idx++; + value[0] = last_value; + } + __syncwarp(); + } + + uint32_t const mb_bits = cur_bitwidths[cur_mb]; + + // need to do in multiple passes if values_per_mb != 32 + uint32_t const num_pass = values_per_mb / warp_size; + + auto d_start = cur_mb_start; + + for (int i = 0; i < num_pass; i++) { + // position at end of the current mini-block since the following calculates + // negative indexes + d_start += (warp_size * mb_bits) / 8; + + // unpack deltas. modified from version in gpuDecodeDictionaryIndices(), but + // that one only unpacks up to bitwidths of 24. simplified some since this + // will always do batches of 32. also replaced branching with a loop. 
+ int64_t delta = 0; + if (lane_id + current_value_idx < value_count) { + int32_t ofs = (lane_id - warp_size) * mb_bits; + uint8_t const* p = d_start + (ofs >> 3); + ofs &= 7; + if (p < block_end) { + uint32_t c = 8 - ofs; // 0 - 7 bits + delta = (*p++) >> ofs; + + while (c < mb_bits && p < block_end) { + delta |= (*p++) << c; + c += 8; + } + delta &= (1 << mb_bits) - 1; + } + } + + // add min delta to get true delta + delta += cur_min_delta; + + // do inclusive scan to get value - first_value at each position + __shared__ cub::WarpScan::TempStorage temp_storage; + cub::WarpScan(temp_storage).InclusiveSum(delta, delta); + + // now add first value from header or last value from previous block to get true value + delta += last_value; + value[rolling_index(current_value_idx + warp_size * i + lane_id)] = delta; + + // save value from last lane in warp. this will become the 'first value' added to the + // deltas calculated in the next iteration (or invocation). + if (lane_id == 31) { last_value = delta; } + __syncwarp(); + } + } + + inline __device__ void skip_values(int skip) + { + int const t = threadIdx.x; + int const lane_id = t & 0x1f; + + while (current_value_idx < skip && current_value_idx < num_encoded_values(true)) { + if (t < 32) { + calc_mini_block_values(lane_id); + if (lane_id == 0) { setup_next_mini_block(true); } + } + __syncthreads(); + } + } + + inline __device__ void decode_batch() + { + int const t = threadIdx.x; + int const lane_id = t & 0x1f; + + // unpack deltas and save in db->value + calc_mini_block_values(lane_id); + + // set up for next mini-block + if (lane_id == 0) { setup_next_mini_block(true); } + } +}; + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e49378485fc..b93cbb6c2c5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -35,8 +35,8 @@ namespace { * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ -inline __device__ void gpuOutputString(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +inline __device__ void gpuOutputString(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int src_pos, void* dstv) { @@ -62,7 +62,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(volatile page_state_buffers_s* sb, +inline __device__ void gpuOutputBoolean(page_state_buffers_s volatile* sb, int src_pos, uint8_t* dst) { @@ -137,8 +137,8 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[in] src_pos Source position * @param[out] dst Pointer to row output data */ -inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int src_pos, int64_t* dst) { @@ -210,8 +210,8 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +inline __device__ void gpuOutputInt64Timestamp(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int src_pos, int64_t* dst) { @@ -292,8 +292,8 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * 
@param[in] dst Pointer to row output data */ template -__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int src_pos, T* dst) { @@ -327,8 +327,8 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(volatile page_state_s* s, - volatile page_state_buffers_s* sb, +inline __device__ void gpuOutputFast(page_state_s volatile* s, + page_state_buffers_s volatile* sb, int src_pos, T* dst) { @@ -358,7 +358,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, * @param[in] len Length of element */ static __device__ void gpuOutputGeneric( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos, uint8_t* dst8, int len) + page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos, uint8_t* dst8, int len) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; @@ -422,7 +422,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) } else if ((s->col.data_type & 7) == BYTE_ARRAY) { str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); } - if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } + if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; } return str_len; } @@ -736,6 +736,17 @@ __global__ void __launch_bounds__(preprocess_block_size) } } +// skips strings and delta encodings +struct catch_all_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) + { + return !(is_string_col(page, chunks) || page.encoding == Encoding::DELTA_BINARY_PACKED || + page.encoding == Encoding::DELTA_BYTE_ARRAY); + } +}; + /** * @brief Kernel for computing the column data stored in the pages * @@ -764,7 +775,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, catch_all_filter{chunks}, true)) { return; } @@ -814,7 +825,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } else if ((s->col.data_type & 7) == BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } - if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + if (t == 32) { *(int32_t volatile*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.data_type & 7; @@ -901,7 +912,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } } - if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + if (t == out_thread0) { *(int32_t volatile*)&s->src_pos = target_pos; } } __syncthreads(); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 4469ec59b7a..eba3698730c 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1248,6 +1248,10 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } break; case Encoding::RLE: s->dict_run = 0; break; + case Encoding::DELTA_BINARY_PACKED: + // nothing to do, just don't error + break; + default: s->error = 1; // Unsupported encoding break; diff --git 
a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu new file mode 100644 index 00000000000..3017783cd44 --- /dev/null +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "delta_binary.cuh" +#include "page_string_utils.cuh" +#include "parquet_gpu.hpp" + +#include + +#include +#include + +namespace cudf::io::parquet::gpu { + +namespace { + +// functor for setupLocalPageInfo +struct delta_binary_filter { + __device__ inline bool operator()(PageInfo const& page) + { + return page.encoding == Encoding::DELTA_BINARY_PACKED; + } +}; + +// Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is +// only used for int32 and int64 physical types (and appears to only be used +// with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). +// this kernel only needs 96 threads (3 warps)(for now). +template +__global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) delta_binary_decoder db_state; + __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t & 0x1f; + auto* const db = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, delta_binary_filter{}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // copying logic from gpuDecodePageData. + PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values + __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + // initialize delta state + if (t == 0) { db->init_binary_block(s->data_start, s->data_end); } + __syncthreads(); + + auto const batch_size = db->values_per_mb; + + // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block + // that has a value we need. + if (skipped_leaf_values > 0) { db->skip_values(skipped_leaf_values); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 64) { // warp0..1 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size); + } else { // warp2... + target_pos = min(s->nz_count, src_pos + batch_size); + } + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. 
+ // warp2 waits one cycle for warps 0/1 to produce a batch, and then stuffs values + // into the proper location in the output. + if (t < 32) { + // warp 0 + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < 64) { + // warp 1 + db->decode_batch(); + + } else if (t < 96 && src_pos < target_pos) { + // warp 2 + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int32_t dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. + if (!has_repetition) { dst_pos -= s->first_row; } + + // place value for this thread + if (dst_pos >= 0 && sp < target_pos) { + void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + if (s->dtype_len == 8) { + *static_cast(dst) = db->value[rolling_index(sp + skipped_leaf_values)]; + } else if (s->dtype_len == 4) { + *static_cast(dst) = db->value[rolling_index(sp + skipped_leaf_values)]; + } + } + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + __syncthreads(); + } +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + */ +void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(96, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaBinary + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeDeltaBinary + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 9173d408192..20686ba7c84 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -15,6 +15,7 @@ */ #include "page_decode.cuh" +#include "page_string_utils.cuh" #include #include @@ -26,93 +27,6 @@ namespace gpu { namespace { -// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. -// copies from src to dst in 16B chunks per thread. -__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - using cudf::strings::detail::load_uint4; - - constexpr size_t out_datatype_size = sizeof(uint4); - constexpr size_t in_datatype_size = sizeof(uint); - - auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; - uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); - auto const in_start = src; - - // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. - // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. - // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters - // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. 
- // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid - // `load_uint4` reading beyond string boundaries. - // use signed int since out_end_aligned can be negative. - int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / - out_datatype_size * out_datatype_size - - alignment_offset; - int64_t out_end_aligned = - (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - - alignment_offset; - - for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; - ichar += warp_size * out_datatype_size) { - *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = - load_uint4((const char*)in_start + ichar); - } - - // Tail logic: copy characters of the current string outside - // `[out_start_aligned, out_end_aligned)`. - if (out_end_aligned <= out_start_aligned) { - // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the - // entire string. - for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { - dst[ichar] = in_start[ichar]; - } - } else { - // Copy characters in range `[0, out_start_aligned)`. - if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } - // Copy characters in range `[out_end_aligned, len)`. - int64_t ichar = out_end_aligned + lane_id; - if (ichar < len) { dst[ichar] = in_start[ichar]; } - } -} - -/** - * @brief char-parallel string copy. - */ -__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - if (len > 64) { - wideStrcpy(dst, src, len, lane_id); - } else { - for (int i = lane_id; i < len; i += warp_size) { - dst[i] = src[i]; - } - } -} - -/** - * @brief Perform exclusive scan on an array of any length using a single block of threads. - */ -template -__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) -{ - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - int const t = threadIdx.x; - - // do a series of block sums, storing results in arr as we go - for (int pos = 0; pos < length; pos += block_size) { - int const tidx = pos + t; - size_type tval = tidx < length ? arr[tidx] : 0; - size_type block_sum; - block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); - if (tidx < length) { arr[tidx] = tval; } - initial_value += block_sum; - } -} - /** * @brief Compute the start and end page value bounds for this page * diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh new file mode 100644 index 00000000000..fb36c09052c --- /dev/null +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::io::parquet::gpu { + +// stole this from cudf/strings/detail/gather.cuh. 
modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +inline __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. + int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. + if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. + int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +/** + * @brief char-parallel string copy. + */ +inline __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + +/** + * @brief Perform exclusive scan for offsets array. Called for each page. + */ +template +__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + // do a series of block sums, storing results in arr as we go + for (int pos = 0; pos < length; pos += block_size) { + int const tidx = pos + t; + size_type tval = tidx < length ? 
arr[tidx] : 0; + size_type block_sum; + block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); + if (tidx < length) { arr[tidx] = tval; } + initial_value += block_sum; + } +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 25d2885b7da..7690dcae611 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -549,6 +549,25 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. + * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] stream CUDA stream to use, default 0 + */ +void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 0237bf820b0..44d5e0d4fba 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ namespace cudf::io::detail::parquet { namespace { -int constexpr NUM_DECODERS = 2; // how many decode kernels are there to run +int constexpr NUM_DECODERS = 3; // how many decode kernels are there to run int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; @@ -176,16 +176,30 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_data.host_to_device_async(_stream); _stream.synchronize(); - auto stream1 = get_stream_pool().get_stream(); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + bool const has_delta_binary = std::any_of(pages.begin(), pages.end(), [](auto& page) { + return page.encoding == Encoding::DELTA_BINARY_PACKED; + }); + + auto const level_type_size = _file_itm_data.level_type_size; + + // launch the catch-all page decoder + std::vector streams; + streams.push_back(get_stream_pool().get_stream()); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + + // and then the specializations if (has_strings) { - auto stream2 = get_stream_pool().get_stream(); - chunk_nested_str_data.host_to_device_async(stream2); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); - stream2.synchronize(); + streams.push_back(get_stream_pool().get_stream()); + chunk_nested_str_data.host_to_device_async(streams.back()); + gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } - stream1.synchronize(); + if (has_delta_binary) { + streams.push_back(get_stream_pool().get_stream()); + gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + } + + // synchronize the streams + std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); }); pages.device_to_host_async(_stream); 
page_nesting.device_to_host_async(_stream); diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8c3bdabe6b4..d3ff9651f8a 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -323,7 +323,8 @@ constexpr bool is_supported_encoding(Encoding enc) case Encoding::PLAIN: case Encoding::PLAIN_DICTIONARY: case Encoding::RLE: - case Encoding::RLE_DICTIONARY: return true; + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: return true; default: return false; } } @@ -729,8 +730,8 @@ std::pair>> reader::impl::create_and_read_co auto& chunks = _file_itm_data.chunks; // Descriptors for all the chunks that make up the selected columns - const auto num_input_columns = _input_columns.size(); - const auto num_chunks = row_groups_info.size() * num_input_columns; + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; chunks = cudf::detail::hostdevice_vector(0, num_chunks, _stream); // Association between each column chunk and its source From 9326321fe3c42bf1bba5d0298a0f0f9ec2009684 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 26 Jun 2023 17:51:10 -0700 Subject: [PATCH 02/36] start merging in changes from #13622 --- cpp/src/io/parquet/page_data.cu | 22 ++++++++++++++++++++++ cpp/src/io/parquet/page_hdr.cu | 16 ++++++++++++++++ cpp/src/io/parquet/parquet_gpu.hpp | 18 ++++++++++++++++++ cpp/src/io/parquet/reader_impl.cpp | 12 +++++------- 4 files changed, 61 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index b93cbb6c2c5..1edc32de158 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -20,6 +20,9 @@ #include +#include +#include + namespace cudf { namespace io { namespace parquet { @@ -920,6 +923,25 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } // anonymous namespace +uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream) +{ + // determine which kernels to invoke + // FIXME: when running on device I get and 'invalid device function' error +#if 0 + auto mask_iter = thrust::make_transform_iterator( + pages.d_begin(), [] __device__(auto const& p) { return p.kernel_mask; }); + auto const kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); +#else + auto mask_iter = + thrust::make_transform_iterator(pages.begin(), [](auto const& p) { return p.kernel_mask; }); + auto const kernel_mask = + thrust::reduce(mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); +#endif + return kernel_mask; +} + /** * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 16886d91fc9..bc00e48a69c 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -154,6 +154,20 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) } while (rep_cnt || struct_depth); } +__device__ uint32_t get_kernel_mask(gpu::PageInfo const& page, gpu::ColumnChunkDesc const& chunk) +{ + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } + + // non-string, non-nested, non-dict, non-boolean types + if (page.encoding == Encoding::DELTA_BINARY_PACKED) { + return KERNEL_MASK_DELTA_BINARY; + } else if (is_string_col(chunk)) { + return KERNEL_MASK_STRING; + } + + return KERNEL_MASK_GENERAL; +} + /** * 
@brief Functor to set value to 32 bit integer read from byte stream * @@ -370,6 +384,7 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; + bs->page.kernel_mask = 0; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -420,6 +435,7 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + bs->page.kernel_mask = get_kernel_mask(bs->page, bs->ck); } else { bs->cur = bs->end; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7690dcae611..92bcd947b4b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -86,6 +86,16 @@ enum level_type { NUM_LEVEL_TYPES }; +enum kernel_mask_bits { + KERNEL_MASK_GENERAL = (1 << 0), + KERNEL_MASK_STRING = (1 << 1), + KERNEL_MASK_DELTA_BINARY = (1 << 2) + // KERNEL_MASK_FIXED_WIDTH_DICT, + // KERNEL_MASK_STRINGS, + // KERNEL_NESTED_ + // etc +}; + /** * @brief Nesting information specifically needed by the decode and preprocessing * kernels. @@ -203,6 +213,8 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + + uint32_t kernel_mask; }; /** @@ -454,6 +466,12 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +/** + * @brief Get OR'd sum of page kernel masks. + */ +uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream); + /** * @brief Compute page output size information. * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 44d5e0d4fba..1b6302fa92a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -56,14 +56,16 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + // figure out which kernels to run + auto const kernel_mask = GetKernelMasks(pages, _stream); + // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
- auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); - + auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( @@ -176,10 +178,6 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_data.host_to_device_async(_stream); _stream.synchronize(); - bool const has_delta_binary = std::any_of(pages.begin(), pages.end(), [](auto& page) { - return page.encoding == Encoding::DELTA_BINARY_PACKED; - }); - auto const level_type_size = _file_itm_data.level_type_size; // launch the catch-all page decoder @@ -193,7 +191,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_str_data.host_to_device_async(streams.back()); gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } - if (has_delta_binary) { + if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { streams.push_back(get_stream_pool().get_stream()); gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } From ee7511d2826c3a400d0a9040fec3f7de62ee2a65 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Jun 2023 11:58:17 -0700 Subject: [PATCH 03/36] get reduce working on device --- cpp/src/io/parquet/page_data.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 1edc32de158..6a0ba62e134 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -927,18 +927,12 @@ uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // determine which kernels to invoke - // FIXME: when running on device I get and 'invalid device function' error -#if 0 + // TODO: if lambda doesn't also have the __host__ decorator an 'invalid device function' exception + // is sometimes thrown auto mask_iter = thrust::make_transform_iterator( - pages.d_begin(), [] __device__(auto const& p) { return p.kernel_mask; }); + pages.d_begin(), [] __host__ __device__(PageInfo const& p) { return p.kernel_mask; }); auto const kernel_mask = thrust::reduce( rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); -#else - auto mask_iter = - thrust::make_transform_iterator(pages.begin(), [](auto const& p) { return p.kernel_mask; }); - auto const kernel_mask = - thrust::reduce(mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); -#endif return kernel_mask; } From 2cafe62701828e76ca2a36394b4333cbad92d9ac Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Jun 2023 12:06:01 -0700 Subject: [PATCH 04/36] use functor for transform iterator --- cpp/src/io/parquet/page_data.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 6a0ba62e134..e8c6d2dd32a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -921,19 +921,19 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } } +struct mask_tform { + __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } +}; + } // anonymous namespace uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // determine which kernels to invoke - // TODO: if lambda doesn't also have the __host__ decorator an 'invalid device function' exception - // is sometimes thrown - 
auto mask_iter = thrust::make_transform_iterator( - pages.d_begin(), [] __host__ __device__(PageInfo const& p) { return p.kernel_mask; }); - auto const kernel_mask = thrust::reduce( + auto mask_iter = thrust::make_transform_iterator(pages.d_begin(), mask_tform{}); + return thrust::reduce( rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); - return kernel_mask; } /** From 219ff0bf198c0f249fc9537c0136de28505a416e Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Jun 2023 12:32:41 -0700 Subject: [PATCH 05/36] change filter functors to use kernel_mask --- cpp/src/io/parquet/page_data.cu | 13 +------------ cpp/src/io/parquet/page_decode.cuh | 18 ++++-------------- cpp/src/io/parquet/page_delta_decode.cu | 17 +++++++---------- cpp/src/io/parquet/page_string_decode.cu | 8 +++----- 4 files changed, 15 insertions(+), 41 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e8c6d2dd32a..648a286e392 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -739,17 +739,6 @@ __global__ void __launch_bounds__(preprocess_block_size) } } -// skips strings and delta encodings -struct catch_all_filter { - device_span chunks; - - __device__ inline bool operator()(PageInfo const& page) - { - return !(is_string_col(page, chunks) || page.encoding == Encoding::DELTA_BINARY_PACKED || - page.encoding == Encoding::DELTA_BYTE_ARRAY); - } -}; - /** * @brief Kernel for computing the column data stored in the pages * @@ -778,7 +767,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, catch_all_filter{chunks}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { return; } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index eba3698730c..8e8c15fefe5 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -934,21 +934,11 @@ struct all_types_filter { }; /** - * @brief Functor for setupLocalPageInfo that returns true if this is not a string column. + * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. */ -struct non_string_filter { - device_span chunks; - - __device__ inline bool operator()(PageInfo const& page) { return !is_string_col(page, chunks); } -}; - -/** - * @brief Functor for setupLocalPageInfo that returns true if this is a string column. - */ -struct string_filter { - device_span chunks; - - __device__ inline bool operator()(PageInfo const& page) { return is_string_col(page, chunks); } +struct mask_filter { + int mask; + __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } }; /** diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 3017783cd44..9d345571495 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -27,14 +27,6 @@ namespace cudf::io::parquet::gpu { namespace { -// functor for setupLocalPageInfo -struct delta_binary_filter { - __device__ inline bool operator()(PageInfo const& page) - { - return page.encoding == Encoding::DELTA_BINARY_PACKED; - } -}; - // Decode page data that is DELTA_BINARY_PACKED encoded. 
This encoding is // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). @@ -55,8 +47,13 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, delta_binary_filter{}, true)) { + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{KERNEL_MASK_DELTA_BINARY}, + true)) { return; } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 20686ba7c84..96b4f8f5842 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -463,9 +463,6 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz { __shared__ __align__(16) page_state_s state_g; - // only count if it's a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; int const page_idx = blockIdx.x; int const t = threadIdx.x; @@ -483,7 +480,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, string_filter{chunks}, false)) { + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { return; } @@ -580,7 +578,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { return; } From 0e181a801c7437f3a6113ba200fc238b4ac37ec4 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 28 Jun 2023 15:47:01 -0700 Subject: [PATCH 06/36] pull in changes from #13622 --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/decode_preprocess.cu | 417 +++++++++++++++++++++ cpp/src/io/parquet/delta_binary.cuh | 14 +- cpp/src/io/parquet/page_data.cu | 441 +++-------------------- cpp/src/io/parquet/page_decode.cuh | 120 +++--- cpp/src/io/parquet/page_delta_decode.cu | 36 +- cpp/src/io/parquet/page_string_decode.cu | 73 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 6 + cpp/src/io/parquet/reader_impl.cpp | 14 +- cpp/src/io/parquet/rle_stream.cuh | 36 +- 10 files changed, 630 insertions(+), 528 deletions(-) create mode 100644 cpp/src/io/parquet/decode_preprocess.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f8e500ac906..810ebb8198e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -388,6 +388,7 @@ add_library( src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp + src/io/parquet/decode_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu new file mode 100644 index 00000000000..439422d3554 --- /dev/null +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_decode.cuh" + +#include + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +namespace { + +// # of threads we're decoding with +constexpr int preprocess_block_size = 512; + +// the required number of runs in shared memory we will need to provide the +// rle_stream object +constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(preprocess_block_size); + +// the size of the rolling batch buffer +constexpr int rolling_buf_size = LEVEL_DECODE_BUF_SIZE; + +using unused_state_buf = page_state_buffers_s<0, 0, 0>; + +/** + * + * This function expects the dictionary position to be at 0 and will traverse + * the entire thing. + * + * Operates on a single warp only. Expects t < 32 + * + * @param s The local page info + * @param t Thread index + */ +__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) +{ + size_type target_pos = s->num_input_values; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = + gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); + target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + } + if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; } + return str_len; +} + +/** + * @brief Update output column sizes for every nesting level based on a batch + * of incoming decoded definition and repetition level values. + * + * If bounds_set is true, computes skipped_values and skipped_leaf_values for the + * page to indicate where we need to skip to based on min/max row. + * + * Operates at the block level. 
+ * + * @param s The local page info + * @param target_value_count The target value count to process up to + * @param rep Repetition level buffer + * @param def Definition level buffer + * @param t Thread index + * @param bounds_set A boolean indicating whether or not min/max row bounds have been set + */ +template +static __device__ void gpuUpdatePageSizes(page_state_s* s, + int target_value_count, + level_t const* const rep, + level_t const* const def, + int t, + bool bounds_set) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; + + constexpr int num_warps = preprocess_block_size / 32; + constexpr int max_batch_size = num_warps * 32; + + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + // how many input level values we've processed in the page so far + int value_count = s->input_value_count; + // how many rows we've processed in the page so far + int row_count = s->input_row_count; + // how many leaf values we've processed in the page so far + int leaf_count = s->input_leaf_count; + // whether or not we need to continue checking for the first row + bool skipped_values_set = s->page.skipped_values >= 0; + + while (value_count < target_value_count) { + int const batch_size = min(max_batch_size, target_value_count - value_count); + + // start/end depth + int start_depth, end_depth, d; + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); + + // is this thread within row bounds? in the non skip_rows/num_rows case this will always + // be true. + int in_row_bounds = 1; + + // if we are in the skip_rows/num_rows case, we need to check against these limits + if (bounds_set) { + // get absolute thread row index + int const is_new_row = start_depth == 0; + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if this thread is in row bounds + int const row_index = (thread_row_count + row_count) - 1; + in_row_bounds = + (row_index >= s->row_index_lower_bound) && (row_index < (s->first_row + s->num_rows)); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set) { + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. + if (local_count == 1 && in_row_bounds) { + s->page.skipped_values = value_count + t; + s->page.skipped_leaf_values = + leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + } + + // increment value counts across all nesting depths + for (int s_idx = 0; s_idx < max_depth; s_idx++) { + int const in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds); + int const count = block_reduce(temp_storage.reduce_storage).Sum(in_nesting_bounds); + __syncthreads(); + if (!t) { + PageNestingInfo* pni = &s->page.nesting[s_idx]; + pni->batch_size += count; + } + } + + value_count += batch_size; + } + + // update final outputs + if (!t) { + s->input_value_count = value_count; + + // only used in the skip_rows/num_rows case + s->input_leaf_count = leaf_count; + s->input_row_count = row_count; + } +} + +/** + * @brief Kernel for computing per-page column size information for all nesting levels. + * + * This function will write out the size field for each level of nesting. + * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows + * @param is_base_pass Whether or not this is the base pass. We first have to compute + * the full size information of every page before we come through in a second (trim) pass + * to determine what subset of rows in this page we should be reading + * @param compute_string_sizes Whether or not we should be computing string sizes + * (PageInfo::str_bytes) as part of the pass + */ +template +__global__ void __launch_bounds__(preprocess_block_size) + gpuComputePageSizes(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + bool is_base_pass, + bool compute_string_sizes) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int page_idx = blockIdx.x; + int t = threadIdx.x; + PageInfo* pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, + {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + // the size of the rolling batch buffer + int const max_batch_size = rolling_buf_size; + level_t* rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + level_t* def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def, + s->page.num_input_values); + if (has_repetition) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep, + s->page.num_input_values); + } + __syncthreads(); + + if (!t) { + s->page.skipped_values = -1; + s->page.skipped_leaf_values = 0; + s->page.str_bytes = 0; + s->input_row_count = 0; + s->input_value_count = 0; + + // in the base pass, we're computing the number of rows, make sure we visit absolutely + // everything + if 
(is_base_pass) { + s->first_row = 0; + s->num_rows = INT_MAX; + s->row_index_lower_bound = -1; + } + } + + // we only need to preprocess hierarchies with repetition in them (ie, hierarchies + // containing lists anywhere within). + compute_string_sizes = + compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + + // early out optimizations: + + // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need + // to do the expensive work of traversing the level data to determine sizes. we can just compute + // it directly. + if (!has_repetition && !compute_string_sizes) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + if (is_base_pass) { pp->nesting[thread_depth].size = pp->num_input_values; } + pp->nesting[thread_depth].batch_size = pp->num_input_values; + } + depth += blockDim.x; + } + return; + } + + // in the trim pass, for anything with lists, we only need to fully process bounding pages (those + // at the beginning or the end of the row bounds) + if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + // if we are not a bounding page (as checked above) then we are either + // returning all rows/values from this page, or 0 of them + pp->nesting[thread_depth].batch_size = + (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) + ? 0 + : pp->nesting[thread_depth].size; + } + depth += blockDim.x; + } + return; + } + + // zero sizes + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting[thread_depth].batch_size = 0; + } + depth += blockDim.x; + } + __syncthreads(); + + // the core loop. decode batches of level stream data using rle_stream objects + // and pass the results to gpuUpdatePageSizes + int processed = 0; + while (processed < s->page.num_input_values) { + // TODO: it would not take much more work to make it so that we could run both of these + // decodes concurrently. there are a couple of shared variables internally that would have to + // get dealt with but that's about it. + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + } + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // update page sizes + gpuUpdatePageSizes(s, processed, rep, def, t, !is_base_pass); + __syncthreads(); + } + + // retrieve total string size. 
+ // TODO: make this block-based instead of just 1 warp + if (compute_string_sizes) { + if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } + } + + // update output results: + // - real number of rows for the whole page + // - nesting sizes for the whole page + // - skipped value information for trimmed pages + // - string bytes + if (is_base_pass) { + // nesting level 0 is the root column, so the size is also the # of rows + if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } + + // store off this batch size as the "full" size + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + pp->nesting[thread_depth].size = pp->nesting[thread_depth].batch_size; + } + depth += blockDim.x; + } + } + + if (!t) { + pp->skipped_values = s->page.skipped_values; + pp->skipped_leaf_values = s->page.skipped_leaf_values; + pp->str_bytes = s->page.str_bytes; + } +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::ComputePageSizes + */ +void ComputePageSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + bool compute_num_rows, + bool compute_string_sizes, + int level_type_size, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + // computes: + // PageNestingInfo::size for each level of nesting, for each page. + // This computes the size for the entire page, not taking row bounds into account. + // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" + // the starting and ending read values to account for these bounds. + if (level_type_size == 1) { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } else { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } +} + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index b02164a377f..8e33cc33a0c 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -52,6 +52,8 @@ namespace cudf::io::parquet::gpu { using uleb128_t = uint64_t; using zigzag128_t = int64_t; +constexpr int delta_rolling_buf_size = 256; + /** * @brief Read a ULEB128 varint integer * @@ -103,7 +105,7 @@ struct delta_binary_decoder { uint8_t const* cur_mb_start; // pointer to the start of the current mini-block data uint8_t const* cur_bitwidths; // pointer to the bitwidth array in the block - uleb128_t value[non_zero_buffer_size]; // circular buffer of delta values + uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values // returns the number of values encoded in the block data. when is_decode is true, // account for the first value in the header. otherwise just count the values encoded @@ -207,7 +209,7 @@ struct delta_binary_decoder { // unpack deltas. modified from version in gpuDecodeDictionaryIndices(), but // that one only unpacks up to bitwidths of 24. simplified some since this // will always do batches of 32. also replaced branching with a loop. 
- int64_t delta = 0; + zigzag128_t delta = 0; if (lane_id + current_value_idx < value_count) { int32_t ofs = (lane_id - warp_size) * mb_bits; uint8_t const* p = d_start + (ofs >> 3); @@ -217,10 +219,10 @@ struct delta_binary_decoder { delta = (*p++) >> ofs; while (c < mb_bits && p < block_end) { - delta |= (*p++) << c; + delta |= static_cast(*p++) << c; c += 8; } - delta &= (1 << mb_bits) - 1; + delta &= (static_cast(1) << mb_bits) - 1; } } @@ -233,7 +235,9 @@ struct delta_binary_decoder { // now add first value from header or last value from previous block to get true value delta += last_value; - value[rolling_index(current_value_idx + warp_size * i + lane_id)] = delta; + int const value_idx = + rolling_index(current_value_idx + warp_size * i + lane_id); + value[value_idx] = delta; // save value from last lane in warp. this will become the 'first value' added to the // deltas calculated in the next iteration (or invocation). diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 648a286e392..512364d7c1a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -30,6 +30,9 @@ namespace gpu { namespace { +constexpr int decode_block_size = 128; +constexpr int rolling_buf_size = decode_block_size * 2; + /** * @brief Output a string descriptor * @@ -38,8 +41,9 @@ namespace { * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ +template inline __device__ void gpuOutputString(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int src_pos, void* dstv) { @@ -65,11 +69,10 @@ inline __device__ void gpuOutputString(page_state_s volatile* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(page_state_buffers_s volatile* sb, - int src_pos, - uint8_t* dst) +template +inline __device__ void gpuOutputBoolean(state_buf volatile* sb, int src_pos, uint8_t* dst) { - *dst = sb->dict_idx[rolling_index(src_pos)]; + *dst = sb->dict_idx[rolling_index(src_pos)]; } /** @@ -140,8 +143,9 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[in] src_pos Source position * @param[out] dst Pointer to row output data */ +template inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int src_pos, int64_t* dst) { @@ -152,8 +156,9 @@ inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - src8 = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + src8 = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -213,8 +218,9 @@ inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ +template inline __device__ void gpuOutputInt64Timestamp(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int src_pos, int64_t* dst) { @@ -224,8 +230,9 @@ inline __device__ void gpuOutputInt64Timestamp(page_state_s volatile* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - src8 = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? 
sb->dict_idx[rolling_index(src_pos)] : 0; + src8 = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -294,16 +301,18 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -template +template __device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int src_pos, T* dst) { uint32_t const dtype_len_in = s->dtype_len_in; uint8_t const* data = s->dict_base ? s->dict_base : s->data_start; uint32_t const pos = - (s->dict_base ? ((s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0) : src_pos) * + (s->dict_base + ? ((s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0) + : src_pos) * dtype_len_in; uint32_t const dict_size = s->dict_size; @@ -329,9 +338,9 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s volatile* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -template +template inline __device__ void gpuOutputFast(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int src_pos, T* dst) { @@ -340,8 +349,9 @@ inline __device__ void gpuOutputFast(page_state_s volatile* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - dict = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + dict = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -360,16 +370,18 @@ inline __device__ void gpuOutputFast(page_state_s volatile* s, * @param[in] dst8 Pointer to row output data * @param[in] len Length of element */ +template static __device__ void gpuOutputGeneric( - page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos, uint8_t* dst8, int len) + page_state_s volatile* s, state_buf volatile* sb, int src_pos, uint8_t* dst8, int len) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - dict = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + dict = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -404,341 +416,6 @@ static __device__ void gpuOutputGeneric( } } -/** - * - * This function expects the dictionary position to be at 0 and will traverse - * the entire thing. - * - * Operates on a single warp only. Expects t < 32 - * - * @param s The local page info - * @param t Thread index - */ -__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) -{ - size_type target_pos = s->num_input_values; - size_type str_len = 0; - if (s->dict_base) { - auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); - target_pos = new_target_pos; - str_len = len; - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); - } - if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; } - return str_len; -} - -/** - * @brief Update output column sizes for every nesting level based on a batch - * of incoming decoded definition and repetition level values. - * - * If bounds_set is true, computes skipped_values and skipped_leaf_values for the - * page to indicate where we need to skip to based on min/max row. - * - * Operates at the block level. 
- * - * @param s The local page info - * @param target_value_count The target value count to process up to - * @param rep Repetition level buffer - * @param def Definition level buffer - * @param t Thread index - * @param bounds_set A boolean indicating whether or not min/max row bounds have been set - */ -template -static __device__ void gpuUpdatePageSizes(page_state_s* s, - int target_value_count, - level_t const* const rep, - level_t const* const def, - int t, - bool bounds_set) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - - constexpr int num_warps = preprocess_block_size / 32; - constexpr int max_batch_size = num_warps * 32; - - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; - - // how many input level values we've processed in the page so far - int value_count = s->input_value_count; - // how many rows we've processed in the page so far - int row_count = s->input_row_count; - // how many leaf values we've processed in the page so far - int leaf_count = s->input_leaf_count; - // whether or not we need to continue checking for the first row - bool skipped_values_set = s->page.skipped_values >= 0; - - while (value_count < target_value_count) { - int const batch_size = min(max_batch_size, target_value_count - value_count); - - // start/end depth - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); - - // is this thread within row bounds? in the non skip_rows/num_rows case this will always - // be true. - int in_row_bounds = 1; - - // if we are in the skip_rows/num_rows case, we need to check against these limits - if (bounds_set) { - // get absolute thread row index - int const is_new_row = start_depth == 0; - int thread_row_count, block_row_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_row, thread_row_count, block_row_count); - __syncthreads(); - - // get absolute thread leaf index - int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level); - int thread_leaf_count, block_leaf_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); - __syncthreads(); - - // if this thread is in row bounds - int const row_index = (thread_row_count + row_count) - 1; - in_row_bounds = - (row_index >= s->row_index_lower_bound) && (row_index < (s->first_row + s->num_rows)); - - // if we have not set skipped values yet, see if we found the first in-bounds row - if (!skipped_values_set) { - int local_count, global_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(in_row_bounds, local_count, global_count); - __syncthreads(); - - // we found it - if (global_count > 0) { - // this is the thread that represents the first row. - if (local_count == 1 && in_row_bounds) { - s->page.skipped_values = value_count + t; - s->page.skipped_leaf_values = - leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); - } - skipped_values_set = true; - } - } - - row_count += block_row_count; - leaf_count += block_leaf_count; - } - - // increment value counts across all nesting depths - for (int s_idx = 0; s_idx < max_depth; s_idx++) { - int const in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds); - int const count = block_reduce(temp_storage.reduce_storage).Sum(in_nesting_bounds); - __syncthreads(); - if (!t) { - PageNestingInfo* pni = &s->page.nesting[s_idx]; - pni->batch_size += count; - } - } - - value_count += batch_size; - } - - // update final outputs - if (!t) { - s->input_value_count = value_count; - - // only used in the skip_rows/num_rows case - s->input_leaf_count = leaf_count; - s->input_row_count = row_count; - } -} - -/** - * @brief Kernel for computing per-page column size information for all nesting levels. - * - * This function will write out the size field for each level of nesting. - * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows - * @param is_base_pass Whether or not this is the base pass. We first have to compute - * the full size information of every page before we come through in a second (trim) pass - * to determine what subset of rows in this page we should be reading - * @param compute_string_sizes Whether or not we should be computing string sizes - * (PageInfo::str_bytes) as part of the pass - */ -template -__global__ void __launch_bounds__(preprocess_block_size) - gpuComputePageSizes(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_base_pass, - bool compute_string_sizes) -{ - __shared__ __align__(16) page_state_s state_g; - - page_state_s* const s = &state_g; - int page_idx = blockIdx.x; - int t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; - - // whether or not we have repetition levels (lists) - bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; - - // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; - - // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } - - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = lvl_buf_size; - level_t* rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - level_t* def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - max_batch_size, - def, - s->page.num_input_values); - if (has_repetition) { - decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], - s->abs_lvl_start[level_type::REPETITION], - s->abs_lvl_end[level_type::REPETITION], - max_batch_size, - rep, - s->page.num_input_values); - } - __syncthreads(); - - if (!t) { - s->page.skipped_values = -1; - s->page.skipped_leaf_values = 0; - s->page.str_bytes = 0; - s->input_row_count = 0; - s->input_value_count = 0; - - // in the base pass, we're computing the number of rows, make sure we visit absolutely - // everything - if (is_base_pass) { - s->first_row = 0; - s->num_rows = 
INT_MAX; - s->row_index_lower_bound = -1; - } - } - - // we only need to preprocess hierarchies with repetition in them (ie, hierarchies - // containing lists anywhere within). - compute_string_sizes = - compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); - - // early out optimizations: - - // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need - // to do the expensive work of traversing the level data to determine sizes. we can just compute - // it directly. - if (!has_repetition && !compute_string_sizes) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - if (is_base_pass) { pp->nesting[thread_depth].size = pp->num_input_values; } - pp->nesting[thread_depth].batch_size = pp->num_input_values; - } - depth += blockDim.x; - } - return; - } - - // in the trim pass, for anything with lists, we only need to fully process bounding pages (those - // at the beginning or the end of the row bounds) - if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - // if we are not a bounding page (as checked above) then we are either - // returning all rows/values from this page, or 0 of them - pp->nesting[thread_depth].batch_size = - (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) - ? 0 - : pp->nesting[thread_depth].size; - } - depth += blockDim.x; - } - return; - } - - // zero sizes - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting[thread_depth].batch_size = 0; - } - depth += blockDim.x; - } - __syncthreads(); - - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuUpdatePageSizes - int processed = 0; - while (processed < s->page.num_input_values) { - // TODO: it would not take much more work to make it so that we could run both of these - // decodes concurrently. there are a couple of shared variables internally that would have to - // get dealt with but that's about it. - if (has_repetition) { - decoders[level_type::REPETITION].decode_next(t); - __syncthreads(); - } - // the # of rep/def levels will always be the same size - processed += decoders[level_type::DEFINITION].decode_next(t); - __syncthreads(); - - // update page sizes - gpuUpdatePageSizes(s, processed, rep, def, t, !is_base_pass); - __syncthreads(); - } - - // retrieve total string size. 
- // TODO: make this block-based instead of just 1 warp - if (compute_string_sizes) { - if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } - } - - // update output results: - // - real number of rows for the whole page - // - nesting sizes for the whole page - // - skipped value information for trimmed pages - // - string bytes - if (is_base_pass) { - // nesting level 0 is the root column, so the size is also the # of rows - if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } - - // store off this batch size as the "full" size - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - pp->nesting[thread_depth].size = pp->nesting[thread_depth].batch_size; - } - depth += blockDim.x; - } - } - - if (!t) { - pp->skipped_values = s->page.skipped_values; - pp->skipped_leaf_values = s->page.skipped_leaf_values; - pp->str_bytes = s->page.str_bytes; - } -} - /** * @brief Kernel for computing the column data stored in the pages * @@ -757,12 +434,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(16) + page_state_buffers_s + state_buffers; - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int page_idx = blockIdx.x; - int t = threadIdx.x; + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; @@ -782,8 +461,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[rolling_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; @@ -824,7 +503,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( src_pos += t - out_thread0; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; // for the flat hierarchy case we will be reading from the beginning of the value stream, // regardless of the value of first_row. so adjust our destination offset accordingly. @@ -925,36 +604,6 @@ uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); } -/** - * @copydoc cudf::io::parquet::gpu::ComputePageSizes - */ -void ComputePageSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, - size_t min_row, - size_t num_rows, - bool compute_num_rows, - bool compute_string_sizes, - int level_type_size, - rmm::cuda_stream_view stream) -{ - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - // computes: - // PageNestingInfo::size for each level of nesting, for each page. 
- // This computes the size for the entire page, not taking row bounds into account. - // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" - // the starting and ending read values to account for these bounds. - if (level_type_size == 1) { - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); - } else { - gpuComputePageSizes - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); - } -} - /** * @copydoc cudf::io::parquet::gpu::DecodePageData */ @@ -971,10 +620,10 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageData + gpuDecodePageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuDecodePageData + gpuDecodePageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 8e8c15fefe5..91e19bd380d 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -25,17 +25,6 @@ namespace cudf::io::parquet::gpu { -constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 -constexpr int decode_block_size = 128; -constexpr int non_zero_buffer_size = decode_block_size * 2; - -constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } -template -constexpr int rolling_lvl_index(int index) -{ - return index % lvl_buf_size; -} - struct page_state_s { uint8_t const* data_start; uint8_t const* data_end; @@ -83,10 +72,15 @@ struct page_state_s { // buffers only used in the decode kernel. separated from page_state_s to keep // shared memory usage in other kernels (eg, gpuComputePageSizes) down. +template struct page_state_buffers_s { - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings + static constexpr int nz_buf_size = _nz_buf_size; + static constexpr int dict_buf_size = _dict_buf_size; + static constexpr int str_buf_size = _str_buf_size; + + uint32_t nz_idx[nz_buf_size]; // circular buffer of non-null value positions + uint32_t dict_idx[dict_buf_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[str_buf_size]; // String length for plain encoding of strings }; // Copies null counts back to `nesting_decode` at the end of scope @@ -175,8 +169,10 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * * @return A pair containing a pointer to the string and its length */ -inline __device__ cuda::std::pair gpuGetStringData( - page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos) +template +inline __device__ cuda::std::pair gpuGetStringData(page_state_s volatile* s, + state_buf volatile* sb, + int src_pos) { char const* ptr = nullptr; size_t len = 0; @@ -184,7 +180,9 @@ inline __device__ cuda::std::pair gpuGetStringData( if (s->dict_base) { // String dictionary uint32_t dict_pos = - (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + (s->dict_bits > 0) + ? 
sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) + : 0; if (dict_pos < (uint32_t)s->dict_size) { auto const* src = reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; @@ -192,10 +190,10 @@ inline __device__ cuda::std::pair gpuGetStringData( } } else { // Plain encoding - uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; if (dict_pos <= (uint32_t)s->dict_size) { ptr = reinterpret_cast(s->data_start + dict_pos); - len = sb->str_len[rolling_index(src_pos)]; + len = sb->str_len[rolling_index(src_pos)]; } } @@ -216,12 +214,9 @@ inline __device__ cuda::std::pair gpuGetStringData( * decodes strings beyond target_pos, the total length of strings returned will include these * additional values. */ -template +template __device__ cuda::std::pair gpuDecodeDictionaryIndices( - page_state_s volatile* s, - [[maybe_unused]] page_state_buffers_s volatile* sb, - int target_pos, - int t) + page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t) { uint8_t const* end = s->data_end; int dict_bits = s->dict_bits; @@ -297,7 +292,9 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( } // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos + t)] = dict_idx; + } } // if we're computing sizes, add the length(s) @@ -333,8 +330,9 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * * @return The new output position */ +template inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int target_pos, int t) { @@ -383,7 +381,7 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, } else { dict_idx = s->dict_val; } - sb->dict_idx[rolling_index(pos + t)] = dict_idx; + sb->dict_idx[rolling_index(pos + t)] = dict_idx; } pos += batch_len; } @@ -401,9 +399,9 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, * * @return Total length of strings processed */ -template +template __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, - [[maybe_unused]] page_state_buffers_s volatile* sb, + [[maybe_unused]] state_buf volatile* sb, int target_pos, int t) { @@ -426,8 +424,8 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, len = 0; } if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; } k += len; total_len += len; @@ -448,7 +446,7 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ -template +template __device__ void gpuDecodeStream( level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { @@ -515,8 +513,8 @@ __device__ void gpuDecodeStream( level_run -= batch_len * 2; } if (t < batch_len) { - int idx = value_count + t; - output[rolling_index(idx)] = level_val; + int idx = value_count + t; + output[rolling_index(idx)] = level_val; } batch_coded_count += batch_len; value_count += batch_len; @@ -540,21 +538,22 @@ __device__ void gpuDecodeStream( * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ -inline __device__ void 
store_validity(PageNestingDecodeInfo* nesting_info, +inline __device__ void store_validity(int valid_map_offset, + bitmask_type* valid_map, uint32_t valid_mask, int32_t value_count) { - int word_offset = nesting_info->valid_map_offset / 32; - int bit_offset = nesting_info->valid_map_offset % 32; + int word_offset = valid_map_offset / 32; + int bit_offset = valid_map_offset % 32; // if we fit entirely in the output word if (bit_offset + value_count <= 32) { auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); if (relevant_mask == ~0) { - nesting_info->valid_map[word_offset] = valid_mask; + valid_map[word_offset] = valid_mask; } else { - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); } } // we're going to spill over into the next word. @@ -568,17 +567,15 @@ inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, // first word. strip bits_left bits off the beginning and store that uint32_t relevant_mask = ((1 << bits_left) - 1); uint32_t mask_word0 = valid_mask & relevant_mask; - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, mask_word0 << bit_offset); // second word. strip the remainder of the bits off the end and store that relevant_mask = ((1 << (value_count - bits_left)) - 1); uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); - atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); - atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); + atomicAnd(valid_map + word_offset + 1, ~(relevant_mask)); + atomicOr(valid_map + word_offset + 1, mask_word1 >> bits_left); } - - nesting_info->valid_map_offset += value_count; } /** @@ -596,7 +593,7 @@ inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ -template +template inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, @@ -611,7 +608,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int const index = rolling_lvl_index(input_value_count + t); + int const index = rolling_index(input_value_count + t); d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to @@ -640,10 +637,10 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, - page_state_buffers_s* sb, + state_buf* sb, level_t const* const rep, level_t const* const def, int t) @@ -663,7 +660,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // determine the nesting bounds for this thread (the range of nesting depths we // will generate new value indices and validity bits for) int start_depth, end_depth, d; - 
get_nesting_bounds( + get_nesting_bounds( start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); // 4 interesting things to track: @@ -730,7 +727,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value int const src_pos = nesting_info->valid_count + thread_valid_count; int const dst_pos = nesting_info->value_count + thread_value_count; // nz_idx is a mapping of src buffer indices to destination buffer indices - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // compute warp and thread value counts for the -next- nesting level. we need to @@ -775,8 +772,11 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value if (!t) { if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); - + store_validity(nesting_info->valid_map_offset, + nesting_info->valid_map, + warp_output_valid_mask, + warp_valid_mask_bit_count); + nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } nesting_info->valid_count += warp_valid_count; @@ -819,9 +819,9 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value * @param[in] def Definition level buffer * @param[in] t Thread index */ -template +template __device__ void gpuDecodeLevels(page_state_s* s, - page_state_buffers_s* sb, + state_buf* sb, int32_t target_leaf_count, level_t* const rep, level_t* const def, @@ -833,8 +833,10 @@ __device__ void gpuDecodeLevels(page_state_s* s, int cur_leaf_count = target_leaf_count; while (!s->error && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } - gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); + if (has_repetition) { + gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); + } + gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); __syncwarp(); // because the rep and def streams are encoded separately, we cannot request an exact @@ -845,7 +847,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices( + gpuUpdateValidityOffsetsAndRowIndices( actual_leaf_count, s, sb, rep, def, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 9d345571495..e8fafe4445f 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -31,20 +31,20 @@ namespace { // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). // this kernel only needs 96 threads (3 warps)(for now). 
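For orientation, the arithmetic the delta binary kernel below performs can be sketched serially on the host. The sketch is illustrative only and not part of the patch (the name reconstruct_miniblock and the vector types are assumptions); the device code instead handles the deltas a warp at a time, carrying the value produced by the last lane of each batch into the next one.

#include <cstdint>
#include <vector>

// Each stored entry of a mini-block is (delta - min_delta) bit-packed, so
// value[i] = value[i-1] + min_delta + packed[i].
std::vector<int64_t> reconstruct_miniblock(int64_t last_value,  // first value from the page header,
                                                                // or last value of the prior block
                                           int64_t min_delta,   // minimum delta from the block header
                                           std::vector<uint64_t> const& packed)  // unpacked deltas
{
  std::vector<int64_t> out;
  out.reserve(packed.size());
  for (auto p : packed) {
    last_value += min_delta + static_cast<int64_t>(p);
    out.push_back(last_value);
  }
  return out;
}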
-template +template __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) delta_binary_decoder db_state; - __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(16) page_state_s state_g; - - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - int const lane_id = t & 0x1f; - auto* const db = &db_state; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t & 0x1f; + auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo(s, @@ -62,8 +62,8 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // copying logic from gpuDecodePageData. PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; - __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; @@ -98,7 +98,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < 64) { // warp 1 db->decode_batch(); @@ -111,7 +111,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // process the mini-block in batches of 32 for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { // the position in the output column/buffer - int32_t dst_pos = sb->nz_idx[rolling_index(sp)]; + int32_t dst_pos = sb->nz_idx[rolling_index(sp)]; // handle skip_rows here. flat hierarchies can just skip up to first_row. 
if (!has_repetition) { dst_pos -= s->first_row; } @@ -120,9 +120,11 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; if (s->dtype_len == 8) { - *static_cast(dst) = db->value[rolling_index(sp + skipped_leaf_values)]; + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; } else if (s->dtype_len == 4) { - *static_cast(dst) = db->value[rolling_index(sp + skipped_leaf_values)]; + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; } } } @@ -151,10 +153,10 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeDeltaBinary + gpuDecodeDeltaBinary <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuDecodeDeltaBinary + gpuDecodeDeltaBinary <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 96b4f8f5842..8ab52f32226 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -27,6 +27,11 @@ namespace gpu { namespace { +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; + /** * @brief Compute the start and end page value bounds for this page * @@ -40,16 +45,16 @@ namespace { * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders * @return pair containing start and end value indexes - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam rle_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template __device__ thrust::pair page_bounds(page_state_s* const s, size_t min_row, size_t num_rows, bool is_bounds_pg, bool has_repetition, - rle_stream* decoders) + rle_stream* decoders) { using block_reduce = cub::BlockReduce; using block_scan = cub::BlockScan; @@ -78,13 +83,12 @@ __device__ thrust::pair page_bounds(page_state_s* const s, auto const col = &s->col; // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = lvl_buf_size; - auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], - max_batch_size, + preproc_buf_size, def_decode, s->page.num_input_values); // only need repetition if this is a bounds page. 
otherwise all we need is def level info @@ -93,7 +97,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], - max_batch_size, + preproc_buf_size, rep_decode, s->page.num_input_values); } @@ -152,7 +156,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // do something with the level data while (start_val < processed) { int idx_t = start_val + t; - int idx = rolling_lvl_index(idx_t); + int idx = rolling_index(idx_t); // get absolute thread row index int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); @@ -250,7 +254,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, while (start_val < processed) { int idx_t = start_val + t; if (idx_t < processed) { - int idx = rolling_lvl_index(idx_t); + int idx = rolling_index(idx_t); if (def_decode[idx] < max_def) { num_nulls++; } } start_val += preprocess_block_size; @@ -454,10 +458,9 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @param chunks All chunks to be decoded * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { @@ -474,10 +477,15 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + // the required number of runs in shared memory we will need to provide the + // rle_stream object + constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(preprocess_block_size); + // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, + {rep_runs}}; // setup page info if (!setupLocalPageInfo( @@ -499,7 +507,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); // need to save num_nulls and num_valids calculated in page_bounds in this page if (t == 0) { @@ -560,21 +568,22 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; - __shared__ 
__align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + __shared__ __align__(16) + page_state_buffers_s + state_buffers; - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( @@ -592,8 +601,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; - __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values - __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values + __shared__ level_t rep[rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[rolling_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; @@ -614,7 +623,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t src_target_pos = target_pos + skipped_leaf_values; @@ -633,7 +642,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( src_pos += t - out_thread0; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; // for the flat hierarchy case we will be reading from the beginning of the value stream, // regardless of the value of first_row. so adjust our destination offset accordingly. 
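A brief aside on the index arithmetic used throughout these decode loops: the templated rolling_index helper added to parquet_gpu.hpp further below is a plain modulo over the buffer size, which for the power-of-two sizes used here selects the same slot as the bit-mask of the old non-templated helper. Two illustrative compile-time checks (not part of the patch):

// With a power-of-two rolling_size, index % rolling_size and
// index & (rolling_size - 1) pick the same circular-buffer slot.
static_assert(rolling_index<256>(259) == (259 & (256 - 1)));  // both evaluate to 3
static_assert(rolling_index<128>(130) == 2);                  // wraps around to slot 2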
@@ -656,7 +665,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( if (me < warp_size) { for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { - dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; if (!has_repetition) { dst_pos -= s->first_row; } auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 @@ -739,10 +748,10 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputePageStringSizes <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputePageStringSizes <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } @@ -763,10 +772,10 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeStringPageData + gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuDecodeStringPageData + gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 92bcd947b4b..798bc3cb6e2 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -48,6 +48,12 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; // level decode buffer size. constexpr int LEVEL_DECODE_BUF_SIZE = 2048; +template +constexpr int rolling_index(int index) +{ + return index % rolling_size; +} + /** * @brief Struct representing an input column in the file. */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1b6302fa92a..8759bc260c8 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -180,22 +180,28 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto const level_type_size = _file_itm_data.level_type_size; - // launch the catch-all page decoder + // vector of launched streams std::vector streams; - streams.push_back(get_stream_pool().get_stream()); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); - // and then the specializations + // launch string decoder if (has_strings) { streams.push_back(get_stream_pool().get_stream()); chunk_nested_str_data.host_to_device_async(streams.back()); gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } + + // launch delta binary decoder if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { streams.push_back(get_stream_pool().get_stream()); gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } + // launch the catch-all page decoder + if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { + streams.push_back(get_stream_pool().get_stream()); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + } + // synchronize the streams std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); }); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 473db660238..9d2e8baa1cc 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -22,16 +22,11 @@ namespace cudf::io::parquet::gpu { -// TODO: consider if these should be 
template parameters to rle_stream -constexpr int num_rle_stream_decode_threads = 512; -// the -1 here is for the look-ahead warp that fills in the list of runs to be decoded -// in an overlapped manner. so if we had 16 total warps: -// - warp 0 would be filling in batches of runs to be processed -// - warps 1-15 would be decoding the previous batch of runs generated -constexpr int num_rle_stream_decode_warps = - (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); -constexpr int rolling_run_index(int index) { return index % run_buffer_size; } +constexpr int rle_stream_required_run_buffer_size(int num_threads) +{ + int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; + return (num_rle_stream_decode_warps * 2); +} /** * @brief Read a 32-bit varint integer @@ -144,8 +139,19 @@ struct rle_run { }; // a stream of rle_runs -template +template struct rle_stream { + // TODO: consider if these should be template parameters to rle_stream + static constexpr int num_rle_stream_decode_threads = decode_threads; + // the -1 here is for the look-ahead warp that fills in the list of runs to be decoded + // in an overlapped manner. so if we had 16 total warps: + // - warp 0 would be filling in batches of runs to be processed + // - warps 1-15 would be decoding the previous batch of runs generated + static constexpr int num_rle_stream_decode_warps = + (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; + + static constexpr int run_buffer_size = rle_stream_required_run_buffer_size(decode_threads); + int level_bits; uint8_t const* start; uint8_t const* cur; @@ -210,7 +216,7 @@ struct rle_stream { // generate runs until we either run out of warps to decode them with, or // we cross the output limit. while (run_count < num_rle_stream_decode_warps && output_pos < max_count && cur < end) { - auto& run = runs[rolling_run_index(run_index)]; + auto& run = runs[rolling_index(run_index)]; // Encoding::RLE @@ -256,13 +262,13 @@ struct rle_stream { // if we've reached the value output limit on the last run if (output_pos >= max_count) { // first, see if we've spilled over - auto const& src = runs[rolling_run_index(run_index - 1)]; + auto const& src = runs[rolling_index(run_index - 1)]; int const spill_count = output_pos - max_count; // a spill has occurred in the current run. spill the extra values over into the beginning of // the next run. if (spill_count > 0) { - auto& spill_run = runs[rolling_run_index(run_index)]; + auto& spill_run = runs[rolling_index(run_index)]; spill_run = src; spill_run.output_pos = 0; spill_run.remaining = spill_count; @@ -330,7 +336,7 @@ struct rle_stream { // repetition levels for one of the list benchmarks decodes in ~3ms total, while the // definition levels take ~11ms - the difference is entirely due to long runs in the // definition levels. 
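To make the run-buffer sizing above concrete: one warp fills in runs while the remaining warps decode the previous batch, and the ring holds two batches of runs. For the 512-thread preprocess kernels in this patch that works out to 30 run slots; a 128-thread caller would get 6. Illustrative compile-time checks (not part of the patch):

static_assert(rle_stream_required_run_buffer_size(512) == 30);  // 16 warps, 15 decoding, 2 batches
static_assert(rle_stream_required_run_buffer_size(128) == 6);   // 4 warps, 3 decoding, 2 batches (hypothetical caller)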
- auto& run = runs[rolling_run_index(run_start + warp_decode_id)]; + auto& run = runs[rolling_index(run_start + warp_decode_id)]; auto batch = run.next_batch(output + run.output_pos, min(run.remaining, (output_count - run.output_pos))); batch.decode(end, level_bits, warp_lane, warp_decode_id); From b58e55ca0542e241fb089453e68c299f52dec39f Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 28 Jun 2023 16:50:53 -0700 Subject: [PATCH 07/36] use less shared memory for delta binary decoder --- cpp/src/io/parquet/delta_binary.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 8e33cc33a0c..9142405c8a4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -52,7 +52,8 @@ namespace cudf::io::parquet::gpu { using uleb128_t = uint64_t; using zigzag128_t = int64_t; -constexpr int delta_rolling_buf_size = 256; +// we decode one mini-block at a time. max mini-block size seen is 64. +constexpr int delta_rolling_buf_size = 128; /** * @brief Read a ULEB128 varint integer From 2c5e087612689f8ff5e5af7e98cfe9787be1ee67 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 28 Jun 2023 16:55:20 -0700 Subject: [PATCH 08/36] spelling --- cpp/src/io/parquet/delta_binary.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 9142405c8a4..887dd94995a 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -47,7 +47,7 @@ namespace cudf::io::parquet::gpu { // lengths, followed by the concatenated suffix data. // TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of useing +// using max 64 bits. Need to see what the performance impact is of using // __int128_t rather than int64_t. 
using uleb128_t = uint64_t; using zigzag128_t = int64_t; From 996893e8f4b38466642eb108378dd310b72dc8af Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 29 Jun 2023 11:28:46 -0700 Subject: [PATCH 09/36] change encoding to unsupported type --- .../tests/data/parquet/delta_encoding.parquet | Bin 577 -> 577 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet b/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet index e129ced34f3b570ba0ae966277f2111f8f539465..29565bef4d2e79033e2631a46eebedd3292db6b7 100644 GIT binary patch delta 28 icmX@ea*#zhz%j^BlucAlR4E2XF^DpW@@y2=V*&tHoCNOx delta 28 icmX@ea*#zhz%j^BlucAlR4E2XF^DpWa%~jWV*&tHk_7Dl From f1f74dc9daa462a68880cade7928fa50f9e663cb Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 29 Jun 2023 12:30:12 -0700 Subject: [PATCH 10/36] add python test of delta parser --- python/cudf/cudf/tests/test_parquet.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3a35a0088ff..a6d19fe39af 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1306,6 +1306,29 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) +def test_delta_binary(tmpdir): + nrows = 100000 + # Create a pandas dataframe with random data of mixed types + test_pdf = pd.DataFrame( + { + "col_int32": np.random.randint(0, nrows, nrows).astype("int32"), + "col_int64": np.random.randint( + -0x10000000000, 0x10000000000, nrows + ).astype("int64"), + }, + ) + pdf_fname = tmpdir.join("pdfv2.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding="DELTA_BINARY_PACKED", + data_page_version="2.0", + engine="pyarrow", + use_dictionary=False, + ) + assert_eq(cudf.read_parquet(pdf_fname), test_pdf) + + @pytest.mark.parametrize( "data", [ From 639b8abb593d38dc1e31f7ee43f3c235e23a5980 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 29 Jun 2023 13:50:19 -0700 Subject: [PATCH 11/36] test delta with nulls --- python/cudf/cudf/tests/test_parquet.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a6d19fe39af..6e354645148 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1306,17 +1306,24 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) -def test_delta_binary(tmpdir): +@pytest.mark.parametrize("add_nulls", [True, False]) +def test_delta_binary(add_nulls, tmpdir): nrows = 100000 # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - "col_int32": np.random.randint(0, nrows, nrows).astype("int32"), - "col_int64": np.random.randint( - -0x10000000000, 0x10000000000, nrows - ).astype("int64"), + "col_int32": pd.Series( + np.random.randint(0, 0x7FFFFFFF, nrows), dtype="Int32" + ), + "col_int64": pd.Series( + np.random.randint(0, 0x7FFFFFFFFFFFFFFF, nrows), dtype="Int64" + ), }, ) + if add_nulls: + for i in range(0, int(nrows / 4)): + test_pdf.iloc[np.random.randint(0, nrows), 0] = pd.NA + test_pdf.iloc[np.random.randint(0, nrows), 1] = pd.NA pdf_fname = tmpdir.join("pdfv2.parquet") test_pdf.to_parquet( pdf_fname, @@ -1326,7 +1333,9 @@ def test_delta_binary(tmpdir): engine="pyarrow", use_dictionary=False, ) - assert_eq(cudf.read_parquet(pdf_fname), test_pdf) + 
cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + assert_eq(cdf, pcdf) @pytest.mark.parametrize( From c1bbb840f342ae4c2bd0f0fd0efad55ccf0bf2a5 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 29 Jun 2023 17:01:44 -0700 Subject: [PATCH 12/36] add comments to skip_values and decode_batch --- cpp/src/io/parquet/delta_binary.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 887dd94995a..be5256b242d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -247,6 +247,8 @@ struct delta_binary_decoder { } } + // decodes and skips values until the block containing the value after `skip` is reached. + // called by all threads in a thread block. inline __device__ void skip_values(int skip) { int const t = threadIdx.x; @@ -261,6 +263,8 @@ struct delta_binary_decoder { } } + // decodes the current mini block and stores the values obtained. should only be called by + // a single warp. inline __device__ void decode_batch() { int const t = threadIdx.x; From 8e66a080d920fedc9cb39a276f2c5b08a93b4ffc Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 30 Jun 2023 13:44:06 -0700 Subject: [PATCH 13/36] revert east volatile changes --- cpp/src/io/parquet/page_data.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 512364d7c1a..cb92c0807f8 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -42,8 +42,8 @@ constexpr int rolling_buf_size = decode_block_size * 2; * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ template -inline __device__ void gpuOutputString(page_state_s volatile* s, - state_buf volatile* sb, +inline __device__ void gpuOutputString(volatile page_state_s* s, + volatile state_buf* sb, int src_pos, void* dstv) { @@ -70,7 +70,7 @@ inline __device__ void gpuOutputString(page_state_s volatile* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputBoolean(state_buf volatile* sb, int src_pos, uint8_t* dst) +inline __device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst) { *dst = sb->dict_idx[rolling_index(src_pos)]; } @@ -144,8 +144,8 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[out] dst Pointer to row output data */ template -inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, - state_buf volatile* sb, +inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, + volatile state_buf* sb, int src_pos, int64_t* dst) { @@ -219,8 +219,8 @@ inline __device__ void gpuOutputInt96Timestamp(page_state_s volatile* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputInt64Timestamp(page_state_s volatile* s, - state_buf volatile* sb, +inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, + volatile state_buf* sb, int src_pos, int64_t* dst) { @@ -302,8 +302,8 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @param[in] dst Pointer to row output data */ template -__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s volatile* s, - state_buf volatile* sb, +__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, + volatile state_buf* sb, int src_pos, T* dst) { @@ -339,8 +339,8 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s volatile* s, * @param[in] 
dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(page_state_s volatile* s, - state_buf volatile* sb, +inline __device__ void gpuOutputFast(volatile page_state_s* s, + volatile state_buf* sb, int src_pos, T* dst) { @@ -372,7 +372,7 @@ inline __device__ void gpuOutputFast(page_state_s volatile* s, */ template static __device__ void gpuOutputGeneric( - page_state_s volatile* s, state_buf volatile* sb, int src_pos, uint8_t* dst8, int len) + volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; @@ -496,7 +496,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } else if ((s->col.data_type & 7) == BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } - if (t == 32) { *(int32_t volatile*)&s->dict_pos = src_target_pos; } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.data_type & 7; @@ -583,7 +583,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } } - if (t == out_thread0) { *(int32_t volatile*)&s->src_pos = target_pos; } + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } } __syncthreads(); } From 7b09c4f55f48533be262112e8f2b798450b6256a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 7 Jul 2023 12:37:30 -0700 Subject: [PATCH 14/36] update doc string --- cpp/src/io/parquet/page_string_utils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index fb36c09052c..3be52642f3a 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -87,7 +87,7 @@ inline __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, u } /** - * @brief Perform exclusive scan for offsets array. Called for each page. + * @brief Perform exclusive scan on an array of any length using a single block of threads. 
*/ template __device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) From 9b636c7534991abffe600da4d36746ea3228ea4e Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 19 Jul 2023 09:43:54 -0700 Subject: [PATCH 15/36] fix for header location --- cpp/src/io/parquet/decode_preprocess.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 439422d3554..03510cc693e 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -18,7 +18,7 @@ #include -#include +#include #include #include From ceb22ab04a13777bdc42ce4e09955223cf9bf371 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 19 Jul 2023 10:13:42 -0700 Subject: [PATCH 16/36] fix some short-circuit logic --- cpp/src/io/parquet/page_string_decode.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 8ab52f32226..568457bc17c 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -123,7 +123,13 @@ __device__ thrust::pair page_bounds(page_state_s* const s, int row_fudge = -1; // short circuit for no nulls - if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } + if (max_def == 0 && !has_repetition) { + if (t == 0) { + pp->num_nulls = 0; + pp->num_valids = end_row - begin_row; + } + return {begin_row, end_row}; + } int row_count = 0; int leaf_count = 0; From ea49a23bc97489de0075b19681d93e23a0c03ce1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 1 Aug 2023 21:25:22 -0700 Subject: [PATCH 17/36] rename function --- cpp/src/io/parquet/page_data.cu | 4 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 8 ++++++-- cpp/src/io/parquet/reader_impl.cpp | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 51cb810d9c4..ab08368362e 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -596,8 +596,8 @@ struct mask_tform { } // anonymous namespace -uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream) +uint32_t SumPageKernelMasks(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream) { // determine which kernels to invoke auto mask_iter = thrust::make_transform_iterator(pages.d_begin(), mask_tform{}); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 90274921595..2b9887487ba 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -475,9 +475,13 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, /** * @brief Get OR'd sum of page kernel masks. + * + * @param[in] pages List of pages to aggregate + * @param[in] stream CUDA stream to use + * @return Bitwise OR of all page `kernel_mask` values */ -uint32_t GetKernelMasks(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream); +uint32_t SumPageKernelMasks(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream); /** * @brief Compute page output size information. 
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1623e0662d4..87ea404e3e7 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -59,7 +59,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) }); // figure out which kernels to run - auto const kernel_mask = GetKernelMasks(pages, _stream); + auto const kernel_mask = SumPageKernelMasks(pages, _stream); // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, From 3a9f186fd984d6eea70934997c1d67dfadc869fe Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 1 Aug 2023 21:37:35 -0700 Subject: [PATCH 18/36] clean up kernel_mask_for_page() --- cpp/src/io/parquet/page_hdr.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 2fd2c05aeee..84daed56677 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -154,17 +154,24 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) } while (rep_cnt || struct_depth); } -__device__ uint32_t get_kernel_mask(gpu::PageInfo const& page, gpu::ColumnChunkDesc const& chunk) +/** + * @brief Determine which decode kernel to run for the given page. + * + * @param page The page to decode + * @param chunk Column chunk the page belongs to + * @return `kernel_mask_bits` value for the given page +*/ +__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, gpu::ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } - // non-string, non-nested, non-dict, non-boolean types if (page.encoding == Encoding::DELTA_BINARY_PACKED) { return KERNEL_MASK_DELTA_BINARY; } else if (is_string_col(chunk)) { return KERNEL_MASK_STRING; } + // non-string, non-delta return KERNEL_MASK_GENERAL; } @@ -435,7 +442,7 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; - bs->page.kernel_mask = get_kernel_mask(bs->page, bs->ck); + bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } From d7671d7148df173fea756913c9db4aa567579548 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 1 Aug 2023 21:43:21 -0700 Subject: [PATCH 19/36] remove TODO --- cpp/src/io/parquet/rle_stream.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 9d2e8baa1cc..42e1833008a 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -141,7 +141,6 @@ struct rle_run { // a stream of rle_runs template struct rle_stream { - // TODO: consider if these should be template parameters to rle_stream static constexpr int num_rle_stream_decode_threads = decode_threads; // the -1 here is for the look-ahead warp that fills in the list of runs to be decoded // in an overlapped manner. 
so if we had 16 total warps: From 7084d89cfab89017237e23fdb9c2c5a1bbed9f7a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 1 Aug 2023 22:08:23 -0700 Subject: [PATCH 20/36] add some documentation to kernel_mask_bits --- cpp/src/io/parquet/parquet_gpu.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 2b9887487ba..ad41f5b21b0 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -93,14 +93,15 @@ enum level_type { NUM_LEVEL_TYPES }; +/** + * @brief Enum of mask bits for the PageInfo kernel_mask + * + * Used to control which decode kernels to run. +*/ enum kernel_mask_bits { - KERNEL_MASK_GENERAL = (1 << 0), - KERNEL_MASK_STRING = (1 << 1), - KERNEL_MASK_DELTA_BINARY = (1 << 2) - // KERNEL_MASK_FIXED_WIDTH_DICT, - // KERNEL_MASK_STRINGS, - // KERNEL_NESTED_ - // etc + KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel + KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data + KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data }; /** From 9ebeca9c6b7a23d454363402315800d1c5207a74 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 2 Aug 2023 17:22:00 -0700 Subject: [PATCH 21/36] use rand_dataframe() to produce test data --- python/cudf/cudf/tests/test_parquet.py | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 907d36ed143..ca3541c4c31 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1289,21 +1289,28 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): @pytest.mark.parametrize("add_nulls", [True, False]) def test_delta_binary(add_nulls, tmpdir): nrows = 100000 + null_frequency = 0.25 if add_nulls else 0 + # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - "col_int32": pd.Series( - np.random.randint(0, 0x7FFFFFFF, nrows), dtype="Int32" - ), - "col_int64": pd.Series( - np.random.randint(0, 0x7FFFFFFFFFFFFFFF, nrows), dtype="Int64" - ), - }, + arrow_table = dg.rand_dataframe( + dtypes_meta=[ + { + "dtype": "int32", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + { + "dtype": "int64", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + ], + rows=nrows, + use_threads=False, ) - if add_nulls: - for i in range(0, int(nrows / 4)): - test_pdf.iloc[np.random.randint(0, nrows), 0] = pd.NA - test_pdf.iloc[np.random.randint(0, nrows), 1] = pd.NA + # Roundabout conversion to pandas to preserve nulls/data types + cudf_table = cudf.DataFrame.from_arrow(arrow_table) + test_pdf = cudf_table.to_pandas(nullable=True) pdf_fname = tmpdir.join("pdfv2.parquet") test_pdf.to_parquet( pdf_fname, From 761393f5478979934c860c93ade7b1b497c378dc Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 3 Aug 2023 17:34:44 -0700 Subject: [PATCH 22/36] formatting --- cpp/src/io/parquet/page_hdr.cu | 5 +++-- cpp/src/io/parquet/parquet_gpu.hpp | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 84daed56677..f99702a540f 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -160,8 +160,9 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param page The page to decode * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` 
value for the given page -*/ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, gpu::ColumnChunkDesc const& chunk) + */ +__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, + gpu::ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ad41f5b21b0..d0fbe397291 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -97,7 +97,7 @@ enum level_type { * @brief Enum of mask bits for the PageInfo kernel_mask * * Used to control which decode kernels to run. -*/ + */ enum kernel_mask_bits { KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data @@ -476,7 +476,7 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, /** * @brief Get OR'd sum of page kernel masks. - * + * * @param[in] pages List of pages to aggregate * @param[in] stream CUDA stream to use * @return Bitwise OR of all page `kernel_mask` values From 484603273ef9553a2205d14ff6317c295341c308 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 4 Aug 2023 15:41:34 -0700 Subject: [PATCH 23/36] implement suggestion from review Co-authored-by: Vukasin Milovanovic --- cpp/src/io/parquet/page_string_decode.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 568457bc17c..561677106ae 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -45,8 +45,8 @@ constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders * @return pair containing start and end value indexes - * @tparam rle_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam rle_buf_size Size of the buffer used when decoding repetition and definition levels */ template __device__ thrust::pair page_bounds(page_state_s* const s, From d6156253e3130b2657aad5844bed554dc054167c Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 4 Aug 2023 16:20:09 -0700 Subject: [PATCH 24/36] more suggestions from review --- cpp/src/io/parquet/delta_binary.cuh | 20 +++++++++++++------- cpp/src/io/parquet/page_delta_decode.cu | 13 +++++++------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index be5256b242d..9632e58eabb 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -108,12 +108,12 @@ struct delta_binary_decoder { uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values - // returns the number of values encoded in the block data. when is_decode is true, + // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. - constexpr uint32_t num_encoded_values(bool is_decode) + constexpr uint32_t num_encoded_values(bool all_values) { - return value_count == 0 ? 0 : is_decode ? value_count : value_count - 1; + return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; } // read mini-block header into state object. 
should only be called from init_binary_block or @@ -123,6 +123,8 @@ struct delta_binary_decoder { // // on exit db->cur_mb is 0 and db->cur_mb_start points to the first mini-block of data, or // nullptr if out of data. + // is_decode indicates whether this is being called from initialization code (false) or + // the actual decoding (true) inline __device__ void init_mini_block(bool is_decode) { cur_mb = 0; @@ -161,6 +163,8 @@ struct delta_binary_decoder { // skip to the start of the next mini-block. should only be called on thread 0. // calls init_binary_block if currently on the last mini-block in a block. + // is_decode indicates whether this is being called from initialization code (false) or + // the actual decoding (true) inline __device__ void setup_next_mini_block(bool is_decode) { if (current_value_idx >= num_encoded_values(is_decode)) { return; } @@ -242,7 +246,7 @@ struct delta_binary_decoder { // save value from last lane in warp. this will become the 'first value' added to the // deltas calculated in the next iteration (or invocation). - if (lane_id == 31) { last_value = delta; } + if (lane_id == warp_size - 1) { last_value = delta; } __syncwarp(); } } @@ -251,11 +255,12 @@ struct delta_binary_decoder { // called by all threads in a thread block. inline __device__ void skip_values(int skip) { + using cudf::detail::warp_size; int const t = threadIdx.x; - int const lane_id = t & 0x1f; + int const lane_id = t % warp_size; while (current_value_idx < skip && current_value_idx < num_encoded_values(true)) { - if (t < 32) { + if (t < warp_size) { calc_mini_block_values(lane_id); if (lane_id == 0) { setup_next_mini_block(true); } } @@ -267,8 +272,9 @@ struct delta_binary_decoder { // a single warp. inline __device__ void decode_batch() { + using cudf::detail::warp_size; int const t = threadIdx.x; - int const lane_id = t & 0x1f; + int const lane_id = t % warp_size; // unpack deltas and save in db->value calc_mini_block_values(lane_id); diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index e8fafe4445f..6e9377627d2 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -35,6 +35,7 @@ template __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { + using cudf::detail::warp_size; __shared__ __align__(16) delta_binary_decoder db_state; __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) page_state_buffers_s state_buffers; @@ -43,7 +44,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( auto* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; - int const lane_id = t & 0x1f; + int const lane_id = t % warp_size; auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; @@ -82,9 +83,9 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( uint32_t target_pos; uint32_t const src_pos = s->src_pos; - if (t < 64) { // warp0..1 + if (t < 2 * warp_size) { // warp0..1 target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size); - } else { // warp2... + } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } __syncthreads(); @@ -92,18 +93,18 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. // warp2 waits one cycle for warps 0/1 to produce a batch, and then stuffs values // into the proper location in the output. 
- if (t < 32) { + if (t < warp_size) { // warp 0 // decode repetition and definition levels. // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding gpuDecodeLevels(s, sb, target_pos, rep, def, t); - } else if (t < 64) { + } else if (t < 2 * warp_size) { // warp 1 db->decode_batch(); - } else if (t < 96 && src_pos < target_pos) { + } else if (src_pos < target_pos) { // warp 2 // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; From 835e8660ead27353dfc7ba255311165a97be2f92 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 4 Aug 2023 23:35:11 -0700 Subject: [PATCH 25/36] restore old unrolled loop for testing --- cpp/src/io/parquet/delta_binary.cuh | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 9632e58eabb..bd68193c06d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -223,10 +223,45 @@ struct delta_binary_decoder { uint32_t c = 8 - ofs; // 0 - 7 bits delta = (*p++) >> ofs; +#if 1 while (c < mb_bits && p < block_end) { delta |= static_cast(*p++) << c; c += 8; } +#else + // bring back unrolled unpacker for perf testing + if (c < mb_bits && p < block_end) { // up to 8 bits + delta |= (*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // up to 16 bits + delta |= (*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 24 bits + delta |= (*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 32 bits + delta |= static_cast(*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 40 bits + delta |= static_cast(*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 48 bits + delta |= static_cast(*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 56 bits + delta |= static_cast(*p++) << c; + c += 8; + if (c < mb_bits && p < block_end) { // 64 bits + delta |= static_cast(*p++) << c; + } + } + } + } + } + } + } + } +#endif delta &= (static_cast(1) << mb_bits) - 1; } } From 6353a4aa2bb706ab24a8a9dfe3e6c2291726c84c Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 7 Aug 2023 11:25:51 -0700 Subject: [PATCH 26/36] add note to revisit bit unpacker with delta_byte_array --- cpp/src/io/parquet/delta_binary.cuh | 41 ++++------------------------- 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index bd68193c06d..d29c52248d9 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -213,7 +213,11 @@ struct delta_binary_decoder { // unpack deltas. modified from version in gpuDecodeDictionaryIndices(), but // that one only unpacks up to bitwidths of 24. simplified some since this - // will always do batches of 32. also replaced branching with a loop. + // will always do batches of 32. + // NOTE: because this needs to handle up to 64 bits, the branching used in the other + // implementation has been replaced with a loop. While this uses more registers, the + // looping version is just as fast and easier to read. Might need to revisit this when + // DELTA_BYTE_ARRAY is implemented. 
zigzag128_t delta = 0; if (lane_id + current_value_idx < value_count) { int32_t ofs = (lane_id - warp_size) * mb_bits; @@ -223,45 +227,10 @@ struct delta_binary_decoder { uint32_t c = 8 - ofs; // 0 - 7 bits delta = (*p++) >> ofs; -#if 1 while (c < mb_bits && p < block_end) { delta |= static_cast(*p++) << c; c += 8; } -#else - // bring back unrolled unpacker for perf testing - if (c < mb_bits && p < block_end) { // up to 8 bits - delta |= (*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // up to 16 bits - delta |= (*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 24 bits - delta |= (*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 32 bits - delta |= static_cast(*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 40 bits - delta |= static_cast(*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 48 bits - delta |= static_cast(*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 56 bits - delta |= static_cast(*p++) << c; - c += 8; - if (c < mb_bits && p < block_end) { // 64 bits - delta |= static_cast(*p++) << c; - } - } - } - } - } - } - } - } -#endif delta &= (static_cast(1) << mb_bits) - 1; } } From e924a97fac2d68a8e91331a7b65299d9a44c15e1 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 8 Aug 2023 11:27:58 -0700 Subject: [PATCH 27/36] fix and test int8 and int16 handling --- cpp/src/io/parquet/page_delta_decode.cu | 23 +++++++++++++++++------ python/cudf/cudf/tests/test_parquet.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 6e9377627d2..e79a479388f 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -120,12 +120,23 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // place value for this thread if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; - if (s->dtype_len == 8) { - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - } else if (s->dtype_len == 4) { - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; + switch (s->dtype_len) { + case 1: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 2: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 4: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 8: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; } } } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ca3541c4c31..8b86ce67baa 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1294,6 +1294,16 @@ def test_delta_binary(add_nulls, tmpdir): # Create a pandas dataframe with random data of mixed types arrow_table = dg.rand_dataframe( dtypes_meta=[ + { + "dtype": "int8", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + { + "dtype": "int16", + "null_frequency": null_frequency, + "cardinality": nrows, + }, { "dtype": "int32", "null_frequency": null_frequency, From d0bf0cd3321ee5e09b5a12d906505cf15ea359c2 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 8 Aug 2023 13:50:06 -0700 Subject: [PATCH 28/36] fix for single row files --- cpp/src/io/parquet/delta_binary.cuh | 5 ++++- python/cudf/cudf/tests/test_parquet.py | 4 ++-- 2 files changed, 6 
insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index d29c52248d9..4fc8b9cfb8e 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -158,7 +158,9 @@ struct delta_binary_decoder { // init the first mini-block block_start = d_start; - init_mini_block(false); + + // only call init if there are actually encoded values + if (value_count > 1) { init_mini_block(false); } } // skip to the start of the next mini-block. should only be called on thread 0. @@ -197,6 +199,7 @@ struct delta_binary_decoder { value[0] = last_value; } __syncwarp(); + if (current_value_idx >= value_count) { return; } } uint32_t const mb_bits = cur_bitwidths[cur_mb]; diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b86ce67baa..b35526cb397 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1286,9 +1286,9 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) +@pytest.mark.parametrize("nrows", [1, 100000]) @pytest.mark.parametrize("add_nulls", [True, False]) -def test_delta_binary(add_nulls, tmpdir): - nrows = 100000 +def test_delta_binary(nrows, add_nulls, tmpdir): null_frequency = 0.25 if add_nulls else 0 # Create a pandas dataframe with random data of mixed types From 5ac20a08491384aed366550f01b6080225681f81 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 9 Aug 2023 16:36:14 -0700 Subject: [PATCH 29/36] clean up some docstrings --- cpp/src/io/parquet/page_decode.cuh | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index fddb82cd3b2..cab4ef8ecbd 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -166,6 +166,7 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * @param[in] s Page state input * @param[out] sb Page state buffer output * @param[in] src_pos Source position + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return A pair containing a pointer to the string and its length */ @@ -208,6 +209,8 @@ inline __device__ cuda::std::pair gpuGetStringData(page_sta * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to * 31) * @param[in] t Warp1 thread ID (0..31) + * @tparam sizes_only True if only sizes are to be calculated + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return A pair containing the new output position, and the total length of strings decoded (this * will only be valid on thread 0 and if sizes_only is true). 
In the event that this function @@ -327,6 +330,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * @param[out] sb Page state buffer output * @param[in] target_pos Target write position * @param[in] t Thread ID + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return The new output position */ @@ -396,6 +400,8 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, * @param[out] sb Page state buffer output * @param[in] target_pos Target output position * @param[in] t Thread ID + * @tparam sizes_only True if only sizes are to be calculated + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return Total length of strings processed */ @@ -441,10 +447,13 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, /** * @brief Decode values out of a definition or repetition stream * + * @param[out] output Pointer to buffer to store level data to * @param[in,out] s Page state input/output - * @param[in] t target_count Target count of stream values on output + * @param[in] target_count Target count of stream values on output * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data */ template __device__ void gpuDecodeStream( @@ -535,6 +544,7 @@ __device__ void gpuDecodeStream( * * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map * offset is also updated + * @param[in,out] valid_map Pointer to bitmask to store validity information to * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ @@ -592,6 +602,8 @@ inline __device__ void store_validity(int valid_map_offset, * @param[in] input_value_count The current count of input level values we have processed * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data + * @tparam level_t Type used to store decoded repetition and definition levels */ template inline __device__ void get_nesting_bounds(int& start_depth, @@ -636,6 +648,9 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] rep Repetition level buffer * @param[in] def Definition level buffer * @param[in] t Thread index + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam state_buf Typename of the `state_buf` (usually inferred) + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data */ template __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, @@ -818,6 +833,9 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value * @param[in] rep Repetition level buffer * @param[in] def Definition level buffer * @param[in] t Thread index + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam state_buf Typename of the `state_buf` (usually inferred) */ template __device__ void gpuDecodeLevels(page_state_s* s, @@ -861,9 +879,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, * @param[in,out] s The page state * @param[in] cur The current data position * @param[in] end The end of the data - * @param[in] 
level_bits The bits required - * @param[in] is_decode_step True if we are performing the decode step. - * @param[in,out] decoders The repetition and definition level stream decoders + * @param[in] lvl Enum indicating whether this is to initialize repetition or definition level data * * @return The length of the section */ @@ -973,9 +989,9 @@ struct mask_filter { * @param[in] num_rows Maximum number of rows to read * @param[in] filter Filtering function used to decide which pages to operate on * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * @tparam Filter Function that takes a PageInfo reference and returns true if the given page should * be operated on Currently only used by gpuComputePageSizes step) + * @return True if this page should be processed further */ template inline __device__ bool setupLocalPageInfo(page_state_s* const s, From 1d137ec058d593f4bd7f2d524f6c3481c8599a5b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 17 Aug 2023 13:00:28 -0700 Subject: [PATCH 30/36] Apply suggestions from code review Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_string_utils.cuh | 2 +- cpp/src/io/parquet/rle_stream.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 03510cc693e..fb1910f693b 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 3be52642f3a..a68af7cfb16 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 42e1833008a..189e1bd9750 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -24,7 +24,7 @@ namespace cudf::io::parquet::gpu { constexpr int rle_stream_required_run_buffer_size(int num_threads) { - int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; + constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); } From 6cd3e001f729681bd6d5cc5813310033f0e63765 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 17 Aug 2023 13:05:32 -0700 Subject: [PATCH 31/36] fix docstring --- cpp/src/io/parquet/parquet_gpu.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7ac67d27614..cab6d3469b0 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -592,6 +592,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] chunks All chunks to be decoded * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] stream CUDA stream to use, default 0 */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, From a774ac1099491dde0dcbe060e2ee998a0450157a Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 17 Aug 2023 13:47:07 -0700 Subject: [PATCH 32/36] need to pass num_threads as template param to make constexpr --- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 2 +- cpp/src/io/parquet/rle_stream.cuh | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index fb1910f693b..8de3702bc2e 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -35,7 +35,7 @@ constexpr int preprocess_block_size = 512; // the required number of runs in shared memory we will need to provide the // rle_stream object -constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(preprocess_block_size); +constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the size of the rolling batch buffer constexpr int rolling_buf_size = LEVEL_DECODE_BUF_SIZE; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 561677106ae..40a7be800f4 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -485,7 +485,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // the required number of runs in shared memory we will need to provide the // rle_stream object - constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(preprocess_block_size); + constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the level stream decoders __shared__ rle_run def_runs[rle_run_buffer_size]; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 189e1bd9750..2545a074a38 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -22,7 +22,8 @@ namespace cudf::io::parquet::gpu { -constexpr int rle_stream_required_run_buffer_size(int num_threads) +template +constexpr int rle_stream_required_run_buffer_size() { constexpr int 
num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); @@ -149,7 +150,7 @@ struct rle_stream { static constexpr int num_rle_stream_decode_warps = (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; - static constexpr int run_buffer_size = rle_stream_required_run_buffer_size(decode_threads); + static constexpr int run_buffer_size = rle_stream_required_run_buffer_size(); int level_bits; uint8_t const* start; From 6655e86d5dd6f4c5c7b96a3fa3f1c44c63badd21 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 21 Aug 2023 11:03:00 -0700 Subject: [PATCH 33/36] more descriptive name and docstring --- cpp/src/io/parquet/page_data.cu | 4 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 9 ++++++--- cpp/src/io/parquet/reader_impl.cpp | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index ab08368362e..e30df2d1f98 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -596,8 +596,8 @@ struct mask_tform { } // anonymous namespace -uint32_t SumPageKernelMasks(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream) +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream) { // determine which kernels to invoke auto mask_iter = thrust::make_transform_iterator(pages.d_begin(), mask_tform{}); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index c26a3b37490..fd971e342c0 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -478,14 +478,17 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, rmm::cuda_stream_view stream); /** - * @brief Get OR'd sum of page kernel masks. + * @brief Get the set of kernels that need to be invoked on these pages as a bitmask. + * + * This function performs a bitwise OR on all of the individual `kernel_mask` fields on the pages + * passed in. * * @param[in] pages List of pages to aggregate * @param[in] stream CUDA stream to use * @return Bitwise OR of all page `kernel_mask` values */ -uint32_t SumPageKernelMasks(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream); +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream); /** * @brief Compute page output size information. diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8b91ac6e175..3f58fc8d42d 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -59,7 +59,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) }); // figure out which kernels to run - auto const kernel_mask = SumPageKernelMasks(pages, _stream); + auto const kernel_mask = GetAggregatedDecodeKernelMask(pages, _stream); // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. 
This size info will be used to pre-allocate memory for the column, From 373bdd9ec6a9707e09fb9d00b51e0b5282c87b60 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 22 Aug 2023 13:07:57 -0700 Subject: [PATCH 34/36] Apply suggestions from code review Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/page_string_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index a68af7cfb16..ff656c95e8f 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -41,10 +41,10 @@ inline __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid // `load_uint4` reading beyond string boundaries. // use signed int since out_end_aligned can be negative. - int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + int64_t const out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / out_datatype_size * out_datatype_size - alignment_offset; - int64_t out_end_aligned = + int64_t const out_end_aligned = (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - alignment_offset; From 67be44916f1c19027acfe9392d2fa4246095e929 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 22 Aug 2023 13:10:08 -0700 Subject: [PATCH 35/36] formatting --- cpp/src/io/parquet/page_string_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index ff656c95e8f..9395599b3ff 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -42,8 +42,8 @@ inline __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, // `load_uint4` reading beyond string boundaries. // use signed int since out_end_aligned can be negative. int64_t const out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / - out_datatype_size * out_datatype_size - - alignment_offset; + out_datatype_size * out_datatype_size - + alignment_offset; int64_t const out_end_aligned = (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - alignment_offset; From eb4ff08189e009fd228d35d4fb3725193f7c49e8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 23 Aug 2023 11:36:56 -0700 Subject: [PATCH 36/36] implement suggestion from review Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/tests/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c6ffd042036..2cc4b32443d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1316,6 +1316,7 @@ def test_delta_binary(nrows, add_nulls, tmpdir): }, ], rows=nrows, + seed=0, use_threads=False, ) # Roundabout conversion to pandas to preserve nulls/data types
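
Note: as a quick host-side check of the decode path added by this series, the round trip below mirrors the pyarrow writer settings used in the test_delta_binary test above. It is an illustrative sketch only — the file name and data are arbitrary, and it assumes pandas, pyarrow, and a cudf build that includes these patches.

    import numpy as np
    import pandas as pd
    import cudf

    # Write a v2 Parquet file whose integer columns use DELTA_BINARY_PACKED.
    # Dictionary encoding is disabled so the delta encoding is actually emitted.
    df = pd.DataFrame(
        {
            "col_int32": np.arange(1000, dtype="int32"),
            "col_int64": np.arange(1000, dtype="int64") * -1_000_000,
        }
    )
    df.to_parquet(
        "delta_packed.parquet",          # arbitrary output path
        version="2.6",
        data_page_version="2.0",
        column_encoding="DELTA_BINARY_PACKED",
        engine="pyarrow",
        use_dictionary=False,
    )

    # Read it back through libcudf's new delta binary decoder and compare.
    got = cudf.read_parquet("delta_packed.parquet")
    assert got.to_pandas().equals(df)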