From 961e7511669e9a4035200142da01a070f33441fd Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 31 Mar 2023 12:49:57 -0500 Subject: [PATCH 1/2] Reduce shared memory usage in gpuComputePageSizes by ~3k (50%). --- cpp/src/io/parquet/page_data.cu | 150 ++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 58 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 25b9f7fd285..259d9dd35fe 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -77,9 +77,6 @@ struct page_state_s { int32_t dict_pos; // write position of dictionary indices int32_t src_pos; // input read position of final output value int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings // repetition/definition level decoding int32_t input_value_count; // how many values of the input we've processed @@ -99,6 +96,14 @@ struct page_state_s { PageNestingDecodeInfo* nesting_info; }; +// buffers only used in the decode kernel. separated from page_state_s to keep +// shared memory usage in other kernels (eg, gpuComputePageSizes) down. +struct page_state_buffers_s { + uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions + uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings +}; + /** * @brief Returns whether or not a page spans either the beginning or the end of the * specified row bounds @@ -334,6 +339,7 @@ __device__ void gpuDecodeStream( */ template __device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_state_s* s, + volatile page_state_buffers_s* sb, int target_pos, int t) { @@ -411,7 +417,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_st } // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { s->dict_idx[rolling_index(pos + t)] = dict_idx; } + if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } } // if we're computing sizes, add the length(s) @@ -446,7 +452,10 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_st * * @return The new output position */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, int t) +__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int target_pos, + int t) { const uint8_t* end = s->data_end; int pos = s->dict_pos; @@ -493,7 +502,7 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, in } else { dict_idx = s->dict_val; } - s->dict_idx[rolling_index(pos + t)] = dict_idx; + sb->dict_idx[rolling_index(pos + t)] = dict_idx; } pos += batch_len; } @@ -511,7 +520,10 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, in * @return Total length of strings processed */ template -__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) +__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int target_pos, + int t) { int pos = s->dict_pos; int total_len = 0; @@ -532,8 +544,8 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, int targ len = 0; } if constexpr (!sizes_only) { - s->dict_idx[rolling_index(pos)] = k; - s->str_len[rolling_index(pos)] = len; + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; } k += len; total_len += len; @@ -554,8 +566,8 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, int targ * * @return A pair containing a pointer to the string and its length */ -inline __device__ cuda::std::pair gpuGetStringData(volatile page_state_s* s, - int src_pos) +inline __device__ cuda::std::pair gpuGetStringData( + volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) { const char* ptr = nullptr; size_t len = 0; @@ -563,7 +575,7 @@ inline __device__ cuda::std::pair gpuGetStringData(volatile if (s->dict_base) { // String dictionary uint32_t dict_pos = - (s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { const auto* src = reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; @@ -571,10 +583,10 @@ inline __device__ cuda::std::pair gpuGetStringData(volatile } } else { // Plain encoding - uint32_t dict_pos = s->dict_idx[rolling_index(src_pos)]; + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; if (dict_pos <= (uint32_t)s->dict_size) { ptr = reinterpret_cast(s->data_start + dict_pos); - len = s->str_len[rolling_index(src_pos)]; + len = sb->str_len[rolling_index(src_pos)]; } } @@ -588,9 +600,12 @@ inline __device__ cuda::std::pair gpuGetStringData(volatile * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ -inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) +inline __device__ void gpuOutputString(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + void* dstv) { - auto [ptr, len] = gpuGetStringData(s, src_pos); + auto [ptr, len] = gpuGetStringData(s, sb, src_pos); if (s->dtype_len == 4) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. @@ -612,9 +627,12 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(volatile page_state_s* s, int src_pos, uint8_t* dst) +inline __device__ void gpuOutputBoolean(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + uint8_t* dst) { - *dst = s->dict_idx[rolling_index(src_pos)]; + *dst = sb->dict_idx[rolling_index(src_pos)]; } /** @@ -684,7 +702,10 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[in] src_pos Source position * @param[out] dst Pointer to row output data */ -inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) +inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + int64_t* dst) { using cuda::std::chrono::duration_cast; @@ -693,7 +714,7 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] : 0; + dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; src8 = s->dict_base; } else { // Plain @@ -753,7 +774,10 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) +inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + int64_t* dst) { const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; @@ -761,7 +785,7 @@ inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, int src if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] : 0; + dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; src8 = s->dict_base; } else { // Plain @@ -831,12 +855,15 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @param[in] dst Pointer to row output data */ template -__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, int src_pos, T* dst) +__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + T* dst) { uint32_t const dtype_len_in = s->dtype_len_in; uint8_t const* data = s->dict_base ? s->dict_base : s->data_start; uint32_t const pos = - (s->dict_base ? ((s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] : 0) : src_pos) * + (s->dict_base ? ((s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0) : src_pos) * dtype_len_in; uint32_t const dict_size = s->dict_size; @@ -862,14 +889,17 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, int sr * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(volatile page_state_s* s, int src_pos, T* dst) +inline __device__ void gpuOutputFast(volatile page_state_s* s, + volatile page_state_buffers_s* sb, + int src_pos, + T* dst) { const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] : 0; + dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; dict = s->dict_base; } else { // Plain @@ -888,17 +918,15 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, int src_pos, T* d * @param[in] dst8 Pointer to row output data * @param[in] len Length of element */ -static __device__ void gpuOutputGeneric(volatile page_state_s* s, - int src_pos, - uint8_t* dst8, - int len) +static __device__ void gpuOutputGeneric( + volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos, uint8_t* dst8, int len) { const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? s->dict_idx[rolling_index(src_pos)] : 0; + dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; dict = s->dict_base; } else { // Plain @@ -1373,6 +1401,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, */ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, + page_state_buffers_s* sb, int t) { // max nesting depth of the column @@ -1457,7 +1486,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu int const src_pos = nesting_info->valid_count + thread_valid_count; int const dst_pos = nesting_info->value_count + thread_value_count; // nz_idx is a mapping of src buffer indices to destination buffer indices - s->nz_idx[rolling_index(src_pos)] = dst_pos; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // compute warp and thread value counts for the -next- nesting level. we need to @@ -1543,7 +1572,10 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for * @param[in] t Thread index */ -__device__ void gpuDecodeLevels(page_state_s* s, int32_t target_leaf_count, int t) +__device__ void gpuDecodeLevels(page_state_s* s, + page_state_buffers_s* sb, + int32_t target_leaf_count, + int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1563,7 +1595,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, int32_t target_leaf_count, int : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, t); + gpuUpdateValidityOffsetsAndRowIndices(actual_leaf_count, s, sb, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); } @@ -1672,11 +1704,11 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) size_type target_pos = s->num_input_values; size_type str_len = 0; if (s->dict_base) { - auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, target_pos, t); + auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); target_pos = new_target_pos; str_len = len; } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, target_pos, t); + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); } if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } return str_len; @@ -1862,10 +1894,12 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; - page_state_s* const s = &state_g; - int page_idx = blockIdx.x; - int t = threadIdx.x; + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; int out_thread0; if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } @@ -1915,18 +1949,18 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, target_pos, t); + gpuDecodeLevels(s, sb, target_pos, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t src_target_pos = target_pos + skipped_leaf_values; // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, src_target_pos, t & 0x1f).first; + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; } else if ((s->col.data_type & 7) == BOOLEAN) { - src_target_pos = gpuDecodeRleBooleans(s, src_target_pos, t & 0x1f); + src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - gpuInitStringDescriptors(s, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { @@ -1935,7 +1969,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( src_pos += t - out_thread0; // the position in the output column/buffer - int dst_pos = s->nz_idx[rolling_index(src_pos)]; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; // for the flat hierarchy case we will be reading from the beginning of the value stream, // regardless of the value of first_row. so adjust our destination offset accordingly. @@ -1967,7 +2001,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; if (dtype == BYTE_ARRAY) { if (s->col.converted_type == DECIMAL) { - auto const [ptr, len] = gpuGetStringData(s, val_src_pos); + auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); auto const decimal_precision = s->col.decimal_precision; if (decimal_precision <= MAX_DECIMAL32_PRECISION) { gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); @@ -1977,41 +2011,41 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( gpuOutputByteArrayAsInt(ptr, len, static_cast<__int128_t*>(dst)); } } else { - gpuOutputString(s, val_src_pos, dst); + gpuOutputString(s, sb, val_src_pos, dst); } } else if (dtype == BOOLEAN) { - gpuOutputBoolean(s, val_src_pos, static_cast(dst)); + gpuOutputBoolean(s, sb, val_src_pos, static_cast(dst)); } else if (s->col.converted_type == DECIMAL) { switch (dtype) { - case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; default: if (s->dtype_len_in <= sizeof(int32_t)) { - gpuOutputFixedLenByteArrayAsInt(s, val_src_pos, static_cast(dst)); + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); } else if (s->dtype_len_in <= sizeof(int64_t)) { - gpuOutputFixedLenByteArrayAsInt(s, val_src_pos, static_cast(dst)); + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast(dst)); } else { - gpuOutputFixedLenByteArrayAsInt(s, val_src_pos, static_cast<__int128_t*>(dst)); + gpuOutputFixedLenByteArrayAsInt(s, sb, val_src_pos, static_cast<__int128_t*>(dst)); } break; } } else if (dtype == INT96) { - gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { if (s->dtype_len_in == 4) { // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS // TIME_MILLIS is the only duration type stored as int32: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); } else if (s->ts_scale) { - gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt64Timestamp(s, sb, val_src_pos, static_cast(dst)); } else { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); } } else if (dtype_len == 4) { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); } else { - gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); + gpuOutputGeneric(s, sb, val_src_pos, static_cast(dst), dtype_len); } } From 12b2d60751594bccea5e3ea0569224ef528f8e6e Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 6 Apr 2023 12:10:04 -0500 Subject: [PATCH 2/2] Update function comments. --- cpp/src/io/parquet/page_data.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 259d9dd35fe..a68aecaa03c 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -328,6 +328,7 @@ __device__ void gpuDecodeStream( * @brief Performs RLE decoding of dictionary indexes * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to * 31) * @param[in] t Warp1 thread ID (0..31) @@ -447,6 +448,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_st * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] target_pos Target write position * @param[in] t Thread ID * @@ -514,6 +516,7 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, * processed * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] target_pos Target output position * @param[in] t Thread ID * @@ -562,6 +565,7 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, * @brief Retrieves string information for a string at the specified source position * * @param[in] s Page state input + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * * @return A pair containing a pointer to the string and its length @@ -597,6 +601,7 @@ inline __device__ cuda::std::pair gpuGetStringData( * @brief Output a string descriptor * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ @@ -624,6 +629,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, * @brief Output a boolean * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ @@ -699,6 +705,7 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @brief Convert an INT96 Spark timestamp to 64-bit timestamp * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[out] dst Pointer to row output data */ @@ -771,6 +778,7 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, * @brief Output a 64-bit timestamp * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ @@ -851,6 +859,7 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @brief Output a fixed-length byte array as int. * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ @@ -885,6 +894,7 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, * @brief Output a small fixed-length value * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ @@ -914,6 +924,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, * @brief Output a N-byte value * * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output * @param[in] src_pos Source position * @param[in] dst8 Pointer to row output data * @param[in] len Length of element @@ -1397,6 +1408,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, * * @param[in] target_input_value_count The # of repetition/definition levels to process up to * @param[in] s Local page information + * @param[out] sb Page state buffer output * @param[in] t Thread index */ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, @@ -1569,6 +1581,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * Only runs on 1 warp. * * @param[in] s The local page state + * @param[out] sb Page state buffer output * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for * @param[in] t Thread index */