From b5ec22e3cc7f3570f9ef7eeb9b55b0ffde350eb7 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 12 Aug 2024 16:31:00 -0400 Subject: [PATCH 01/38] work in progress --- cpp/src/io/parquet/decode_fixed.cu | 581 +++++++++++++++++++++++------ cpp/src/io/parquet/page_hdr.cu | 11 + cpp/src/io/parquet/parquet_gpu.hpp | 31 ++ cpp/src/io/parquet/reader_impl.cpp | 48 +++ 4 files changed, 566 insertions(+), 105 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ea80ae73c2f..8157198e116 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -195,13 +195,13 @@ struct decode_fixed_width_split_values_func { }; template -static __device__ int gpuUpdateValidityAndRowIndicesNested( +static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - // how many (input) values we've processed in the page so far + // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. @@ -217,69 +217,99 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; + // get definition level. only need to process for nullable columns + int def_level; if constexpr (nullable) { if (def) { - d = t < batch_size + def_level = t < batch_size ? static_cast(def[rolling_index(value_count + t)]) : -1; } else { - d = t < batch_size ? 1 : -1; + def_level = t < batch_size ? 1 : -1; } + } else { + def_level = 0; } - int const thread_value_count = t + 1; - int const block_value_count = batch_size; + //Determine value count & row index + int const thread_value_count = t + 1; //# of output values from the view of this thread + int const block_value_count = batch_size; + int const row_index = t + value_count; //thread_row_index in old + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + //per-warp variables used below for writing validity + int const in_write_row_bounds = (row_index >= first_row) && (row_index < last_row); - // iterate by depth - for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - auto& ni = s->nesting_info[d_idx]; + //bit mask of all threads that passed true + int const in_write_row_bounds_mask = ballot(in_write_row_bounds); - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; - } else { - is_valid = in_row_bounds; + // index of first set bit (in the warp to store) + int write_start = __ffs(in_write_row_bounds_mask) - 1; + + // remaining code is trivial for non-nullable, non-list columns: no need to iterate over depth + if constexpr (!nullable) { + + // if this is valid and we're at the leaf, output dst_pos + int const is_valid = in_row_bounds; + if (is_valid) { + auto& ni = s->nesting_info[max_depth]; + int const thread_valid_count = thread_value_count; + + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } - // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { + // update valid_count + if (t == 0) { + int const block_valid_count = block_value_count; + s->nesting_info[max_depth].valid_count += block_valid_count; + } + + __syncthreads(); // publish modification of nesting_info value_count + } else { + + // column is a nullable non-list: iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int is_valid = ((def_level >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + + // thread and block validity count + // queries is_valid of all threads, stores prior total and total total using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - // validity is processed per-warp + // validity is processed per-warp (lane 0 writes), because writes are atomic // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector + // nested schemas always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. 
int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; + if ((write_start >= 0) && (ni.valid_map != nullptr)) { uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + // absolute input value index + int const vindex = (value_count + thread_value_count) - 1; + + // absolute bit offset into the output validity map + int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; + + // last bit in the warp to store + int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + int const bit_count = write_end - write_start; //in old is warp_valid_mask_bit_count + + uint32_t const warp_output_valid_mask = warp_validity_mask >> write_start; + + store_validity(bit_offset, ni.valid_map, warp_output_valid_mask, bit_count); + + warp_null_count = bit_count - __popc(warp_output_valid_mask); } } @@ -290,25 +320,20 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; - } - // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - __syncthreads(); // handle modification of ni.value_count from below + // if this is valid and we're at the leaf, output dst_pos + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + __syncthreads(); // handle modification of ni.valid_count from below + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.valid_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } + } //END OF DEPTH LOOP } value_count += block_value_count; @@ -351,27 +376,26 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } - } - int const thread_value_count = t + 1; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity int const row_index = (thread_value_count + value_count) - 1; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // determine if is valid + // everything up to the max_def_level is a non-null value int is_valid; if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + // get definition level. only need to process for nullable columns + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } } else { is_valid = in_row_bounds; } @@ -379,32 +403,37 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // thread and block validity count int thread_valid_count, block_valid_count; if constexpr (nullable) { + // use a scan to compute the total number of valid values, as well as the total number of valid + // values for each individual thread (how many valids there are including me, but no one after me) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); __syncthreads(); - // validity is processed per-warp + // validity is processed per-warp, because storing is an atomic operation // // nested schemas always read and write to the same bounds (that is, read and write // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading // at the first value, even if that is before first_row, because we cannot trivially jump to // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. 
- int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); + int const in_write_row_bounds = row_index >= first_row && row_index < last_row; + int const in_write_row_bounds_mask = ballot(in_write_row_bounds); + //is first_thread_in_write_range in old + int const write_start = __ffs(in_write_row_bounds_mask) - 1; // first bit in the warp to store + + int warp_null_count = 0; + if ((write_start >= 0) && (ni.valid_map != nullptr)) { + uint32_t const warp_validity_mask = ballot(is_valid); // is warp_valid_mask in old // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { int const vindex = (value_count + thread_value_count) - 1; // absolute input value index int const bit_offset = (valid_map_offset + vindex + write_start) - first_row; // absolute bit offset into the output validity map int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + cudf::detail::warp_size - __clz(in_write_row_bounds_mask); // last bit in the warp to store int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); //#set bits store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } @@ -439,7 +468,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( if (t == 0) { // update valid value count for decoding and total # of values we've processed ni.valid_count = valid_count; - ni.value_count = value_count; // TODO: remove? this is unused in the non-list path + ni.value_count = value_count; s->nz_count = valid_count; s->input_value_count = value_count; s->input_row_count = value_count; @@ -448,6 +477,259 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, + level_t const* const rep, int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // how many (input) values we've processed in the page so far, prior to this loop iteration + int value_count = s->input_value_count; + + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; +if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; +if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + + int const row_index_lower_bound = s->row_index_lower_bound; + int const max_depth = s->col.max_nesting_depth - 1; + + __syncthreads(); + + while (value_count < target_value_count) { + bool const within_batch = value_count + t < target_value_count; + + // get definition level. only need to process for nullable columns + int def_level; + if constexpr (nullable) { + if (def) { + def_level = within_batch + ? 
static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + def_level = within_batch ? 1 : -1; + } + } else { + def_level = 0; + } + + // use repitition level to get start/end depth + // different for each thread, as each thread has a different r/d + int start_depth = -1, end_depth = -1; + if (within_batch) { + int const index = rolling_index(value_count + t); + int const rep_level = rep[index]; + //computed by generate_depth_remappings() + start_depth = s->nesting_info[rep_level].start_depth; + end_depth = s->nesting_info[def_level].end_depth; +if (t == 0) { printf("def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + def_level, rep_level, start_depth, end_depth); } + } + + //Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 1 : 0; //TODO: UNCOMMENT + int thread_num_new_rows, total_num_new_rows; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_new_row, thread_num_new_rows, total_num_new_rows); + __syncthreads(); //Needed because scan_storage will be reused + +if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_num_new_rows, total_num_new_rows); } + + int const row_index = input_row_count + (thread_num_new_rows - 1); + input_row_count += total_num_new_rows; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // thread and block value count + + // if we are within the range of nesting levels we should be adding value indices for + // is from/in current rep level to/in the rep level AT the depth with the def value + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; + +if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d\n", \ + row_index, in_row_bounds, in_nesting_bounds); } + + // queries is_valid from all threads, stores prior total and total total + int thread_value_count = 0, block_value_count = 0; +/* int thread_value_count, block_value_count; + block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); +*/ + //bit mask of all threads that passed true + int const in_write_row_bounds_mask = ballot(in_row_bounds); + +if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_value_count, block_value_count); } + + // column is either nullable or is a list (or both): iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int is_valid; + if constexpr (nullable) { + is_valid = ((def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + is_valid = in_nesting_bounds; + } + +if (t == 0) { printf("nullable %d, depth %d, max_depth %d, is_valid %d\n", int(nullable), d_idx, max_depth, is_valid); } +if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } + + // thread and block validity count + // queries is_valid of all threads, stores prior total and total total + + // for nested lists, it's more complicated. This block will visit 128 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is thread_value_count. 
+ static_assert(decode_block_size <= 8*sizeof(__uint128_t), + "This code relies on bits for block threads fitting within a uint128!"); + +if (t < 10) { printf("t %d, thread_value_count %d\n", t, thread_value_count); } + +/* using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; + __shared__ typename block_reduce::TempStorage reduce_storage; + auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; + auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ + return lhs | rhs; + }; + __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); +*/ +__uint128_t block_valid_mask = 0; + + //Reduction result is only visible to thread zero, must share with other threads: +/* __shared__ __uint128_t block_valid_mask_storage; + if(t == 0) { block_valid_mask_storage = block_valid_mask; } + __syncthreads(); + block_valid_mask = block_valid_mask_storage; +*/ + auto count_set_bits = [](__uint128_t bits){ + return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); + }; + auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; + int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); + +if (t == 0) { printf("block_valid_mask %d, thread_valid_count %d\n", int(block_valid_mask), thread_valid_count); } +if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for nested schemas so that we can emit an offset for the -current- nesting + // level. more concretely : the offset for the current nesting level == current length of the + // next nesting level + int32_t next_thread_value_count = 0, next_block_value_count = 0; + int next_in_nesting_bounds = 0; + if (d_idx < max_depth) { + //mask is different between depths + next_in_nesting_bounds = + (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; +/* + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); +*/ +if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", next_thread_value_count, next_block_value_count); } + + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. + if (in_nesting_bounds && ni.data_out != nullptr) { + int const idx = ni.value_count + thread_value_count; + cudf::size_type const ofs = s->nesting_info[d_idx + 1].value_count + + next_thread_value_count + + s->nesting_info[d_idx + 1].page_start_value; + //STORE THE OFFSET FOR THE NEW LIST LOCATION + (reinterpret_cast(ni.data_out))[idx] = ofs; + } + } + + // validity is processed per-warp (on lane 0's), because writes are atomic + // + // nested schemas always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
+ int warp_null_count = 0; + if constexpr (nullable) { + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + if ((t % cudf::detail::warp_size) == 0) { + // absolute input value index + int const vindex = (value_count + thread_value_count) - 1; + + // absolute bit offset into the output validity map +//TODO: first_row?? + int const bit_offset = (ni.valid_map_offset + vindex) - first_row; + + // last bit in the warp to store //in old is warp_valid_mask_bit_count + int const bit_count = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); + warp_null_count = bit_count - __popc(warp_validity_mask); + } + } + + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + } + + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.valid_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (ni.value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below + + // update stuff + if (t == 0) { + int const block_valid_count = count_set_bits(block_valid_mask); + ni.valid_count += block_valid_count; + ni.value_count += block_value_count; + } + + // propagate value counts for the next depth level + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + } //END OF DEPTH LOOP + +if (t == 0) { printf("END DEPTH LOOP\n"); } + +//TODO: Shouldn't we guard against threads going beyond the last row? Old algo didn't? + int const batch_size = min(max_batch_size, target_value_count - value_count); + value_count += batch_size; + } + +if (t == 0) { printf("END LOOP\n"); } + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = s->nesting_info[max_depth].valid_count; + s->input_value_count = value_count; + + // If we have lists # rows != # values + s->input_row_count = input_row_count; + } + + __syncthreads(); + return s->nesting_info[max_depth].valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -494,6 +776,7 @@ template typename DecodeValuesFunc> CUDF_KERNEL void __launch_bounds__(decode_block_size_t) @@ -542,25 +825,22 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_nulls = nullable && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - // sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * + sizeof(rle_run), size_t{16}) : 0; constexpr int shared_dict_size = has_dict_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) : 0; constexpr int shared_def_size = cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; - /* - rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ - shared_offset += shared_rep_size; - } - */ + rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_lists_t){ shared_offset += shared_rep_size; } + rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); if constexpr (has_dict_t) { shared_offset += shared_dict_size; } rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); @@ -575,17 +855,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) def, s->page.num_input_values); } - /* + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr(has_lists_t){ + if constexpr (has_lists_t){ rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], rep, s->page.num_input_values); } - */ rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { @@ -605,17 +884,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues + +if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_lists_t), int(has_nesting_t)); } + while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; // only need to process definition levels if this is a nullable column if (should_process_nulls) { + if constexpr (has_lists_t){ + rep_decoder.decode_next(t); + } processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, def, t); + if constexpr (has_lists_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, def, t); + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -628,9 +918,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); + if constexpr (has_lists_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, nullptr, rep, t); + } else { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, nullptr, t); @@ -664,6 +960,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -673,12 
+970,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -688,17 +996,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -708,6 +1028,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -721,6 +1042,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -730,12 +1052,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -745,17 +1078,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -765,6 +1110,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + true, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -779,6 +1125,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -788,12 +1135,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -803,17 +1161,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, true, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, 
num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -823,6 +1193,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, true, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d604642be54..ac39e2ac291 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,6 +181,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } + + if (is_list(chunk)) { + if (page.encoding == Encoding::PLAIN) { + return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; + } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { + return decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST; + } else if (page.encoding == Encoding::PLAIN_DICTIONARY || + page.encoding == Encoding::RLE_DICTIONARY) { + return decode_kernel_mask::FIXED_WIDTH_DICT_LIST; + } } if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index efc1f5ebab1..d666f129af8 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -221,6 +221,9 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages for lists + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages for lists + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists }; // mask representing all the ways in which a string can be encoded @@ -815,6 +818,28 @@ void DecodeStringPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the list column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeListPageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages * @@ -893,6 +918,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -902,6 +928,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -917,6 +944,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -926,6 +954,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -941,6 +970,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -950,6 +980,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f705f6626e7..cc98e263664 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -71,6 +71,8 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // figure out which kernels to run auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); +printf("DECODE DATA PAGE, mask %d\n", kernel_mask); + // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. 
This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than @@ -274,6 +276,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, error_code.data(), streams[s_idx++]); } @@ -286,6 +289,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch byte stream split decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -309,6 +326,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { +printf("LIST PAGE\n"); + DecodePageDataFixed(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -321,6 +353,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } @@ -333,6 +366,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder with dictionaries for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { + DecodePageDataFixedDict(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -345,6 +392,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } From 2ca9618ef3c4f8a0973da9d680143c3776dbb3a7 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 16 Aug 2024 16:19:50 -0400 Subject: [PATCH 02/38] Further work in list code --- cpp/src/io/parquet/decode_fixed.cu | 220 ++++++++++++++++------------- 1 file changed, 119 insertions(+), 101 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8157198e116..57eaaf1079e 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -205,7 +205,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. 
- int const first_row = s->first_row; + int const first_row = s->first_row; //row index WITHIN THE PAGE int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); @@ -232,7 +232,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( } //Determine value count & row index - int const thread_value_count = t + 1; //# of output values from the view of this thread + int const thread_value_count = t; //# of output values from the view of this thread int const block_value_count = batch_size; int const row_index = t + value_count; //thread_row_index in old int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -256,8 +256,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const thread_valid_count = thread_value_count; // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -283,7 +283,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); // validity is processed per-warp (lane 0 writes), because writes are atomic // @@ -296,9 +296,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( uint32_t const warp_validity_mask = ballot(is_valid); if ((t % cudf::detail::warp_size) == 0) { // absolute input value index - int const vindex = (value_count + thread_value_count) - 1; + int const vindex = value_count + thread_value_count; // absolute bit offset into the output validity map + // subtract by first_row: we may skip first N rows, + // but still need to write bits at beginning of output vector int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; // last bit in the warp to store @@ -325,8 +327,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across __syncthreads(); // handle modification of ni.valid_count from below - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } __syncthreads(); // handle modification of ni.valid_count from below @@ -376,11 +378,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // 
determine if is valid @@ -407,7 +409,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // values for each individual thread (how many valids there are including me, but no one after me) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); __syncthreads(); // validity is processed per-warp, because storing is an atomic operation @@ -427,9 +429,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( uint32_t const warp_validity_mask = ballot(is_valid); // is warp_valid_mask in old // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map + int const vindex = value_count + thread_value_count; // absolute input value index + + // absolute bit offset into the output validity map + int const bit_offset = (valid_map_offset + vindex + write_start) - first_row; + int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); // last bit in the warp to store int const bit_count = write_end - write_start; @@ -455,8 +459,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -482,20 +486,23 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { + //What is the output of this? Validity bits and offsets to list starts constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; + int printf_num_threads = 34; + // how many rows we've processed in the page so far int input_row_count = s->input_row_count; -if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; -if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -505,46 +512,34 @@ if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; - // get definition level. 
only need to process for nullable columns - int def_level; - if constexpr (nullable) { - if (def) { - def_level = within_batch - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - def_level = within_batch ? 1 : -1; - } - } else { - def_level = 0; - } - - // use repitition level to get start/end depth + // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int start_depth = -1, end_depth = -1; + int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); int const rep_level = rep[index]; + def_level = static_cast(def[rolling_index(value_count + t)]); + //computed by generate_depth_remappings() start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; -if (t == 0) { printf("def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - def_level, rep_level, start_depth, end_depth); } + if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth); } } //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. - int const is_new_row = start_depth == 0 ? 1 : 0; //TODO: UNCOMMENT - int thread_num_new_rows, total_num_new_rows; + int const is_new_row = start_depth == 0 ? 1 : 0; + int num_prior_new_rows, total_num_new_rows; using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_new_row, thread_num_new_rows, total_num_new_rows); + block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be reused -if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_num_new_rows, total_num_new_rows); } + if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } - int const row_index = input_row_count + (thread_num_new_rows - 1); + int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -554,18 +549,21 @@ if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_n // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; -if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d\n", \ - row_index, in_row_bounds, in_nesting_bounds); } + if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ + row_index, in_row_bounds, in_nesting_bounds, last_row); } + if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } // queries is_valid from all threads, stores prior total and total total int thread_value_count = 0, block_value_count = 0; -/* int thread_value_count, block_value_count; block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); -*/ - //bit mask of all threads that passed true - int const in_write_row_bounds_mask = ballot(in_row_bounds); -if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_value_count, block_value_count); } + if (t == 0) { printf("block_value_count %d\n", block_value_count); } + if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + t, thread_value_count, in_nesting_bounds); } + + //bit mask of all threads that passed true //TODO DELETE ME + //uint32_t const in_write_row_bounds_mask = ballot(in_row_bounds); // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -580,8 +578,9 @@ if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_val is_valid = in_nesting_bounds; } -if (t == 0) { printf("nullable %d, depth %d, max_depth %d, is_valid %d\n", int(nullable), d_idx, max_depth, is_valid); } -if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } + if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", + int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } + if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -593,32 +592,28 @@ if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } static_assert(decode_block_size <= 8*sizeof(__uint128_t), "This code relies on bits for block threads fitting within a uint128!"); -if (t < 10) { printf("t %d, thread_value_count %d\n", t, thread_value_count); } - -/* using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; + using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; __shared__ typename block_reduce::TempStorage reduce_storage; auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ return lhs | rhs; }; __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); -*/ -__uint128_t block_valid_mask = 0; //Reduction result is only visible to thread zero, must share with other threads: -/* __shared__ __uint128_t block_valid_mask_storage; + __shared__ __uint128_t block_valid_mask_storage; if(t == 0) { block_valid_mask_storage = block_valid_mask; } __syncthreads(); block_valid_mask = block_valid_mask_storage; -*/ + auto count_set_bits = [](__uint128_t bits){ return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & 
thread_mask); -if (t == 0) { printf("block_valid_mask %d, thread_valid_count %d\n", int(block_valid_mask), thread_valid_count); } -if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + if (t == 0) { printf("block_valid_mask %d\n", int(block_valid_mask)); } + if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting @@ -630,23 +625,28 @@ if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } //mask is different between depths next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; -/* + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); -*/ -if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", next_thread_value_count, next_block_value_count); } + + if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } + if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } + if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. if (in_nesting_bounds && ni.data_out != nullptr) { + const auto& next_ni = s->nesting_info[d_idx + 1]; int const idx = ni.value_count + thread_value_count; - cudf::size_type const ofs = s->nesting_info[d_idx + 1].value_count + - next_thread_value_count + - s->nesting_info[d_idx + 1].page_start_value; + cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; + //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; + if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } } } @@ -659,20 +659,34 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", int warp_null_count = 0; if constexpr (nullable) { if (ni.valid_map != nullptr) { - uint32_t const warp_validity_mask = ballot(is_valid); +//TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count +//so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... + uint32_t const warp_count_mask = ballot(in_nesting_bounds); if ((t % cudf::detail::warp_size) == 0) { - // absolute input value index - int const vindex = (value_count + thread_value_count) - 1; + // last bit in the warp to store //in old is warp_valid_mask_bit_count +//so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level + int const bit_count = __popc(warp_count_mask); + if(bit_count > 0) { - // absolute bit offset into the output validity map -//TODO: first_row?? 
- int const bit_offset = (ni.valid_map_offset + vindex) - first_row; + // absolute input value index + int const vindex = value_count + thread_value_count; - // last bit in the warp to store //in old is warp_valid_mask_bit_count - int const bit_count = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + // absolute bit offset into the output validity map + //is cumulative sum of bit_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + // valid_map_offset was already set during list pre-processing for appropriate start index + int const bit_offset = ni.valid_map_offset + vindex; + + auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); + auto const bit_range_mask = (1 << bit_count) - 1; //mainly needed for warp_null_count + auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); - warp_null_count = bit_count - __popc(warp_validity_mask); + printf("t %d, thread_value_count %d, vindex %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", + t, thread_value_count, vindex, bit_offset, bit_count, warp_validity_mask); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); + warp_null_count = bit_count - __popc(warp_validity_mask); + } } } @@ -689,9 +703,14 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", __syncthreads(); // handle modification of ni.valid_count from below if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across - int const dst_pos = (ni.value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + int const dst_pos = ni.value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); + + if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } + if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + + sb->nz_idx[output_index] = dst_pos; } __syncthreads(); // handle modification of ni.value_count from below @@ -708,14 +727,13 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", in_nesting_bounds = next_in_nesting_bounds; } //END OF DEPTH LOOP -if (t == 0) { printf("END DEPTH LOOP\n"); } + if (t == 0) { printf("END DEPTH LOOP\n"); } -//TODO: Shouldn't we guard against threads going beyond the last row? Old algo didn't? int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } -if (t == 0) { printf("END LOOP\n"); } + if (t == 0) { printf("END LOOP\n"); } if (t == 0) { // update valid value count for decoding and total # of values we've processed @@ -823,6 +841,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. all shared memory is suballocated out of here constexpr int shared_rep_size = has_lists_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * @@ -848,7 +867,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (should_process_nulls) { + if (should_process_def_levels) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -885,7 +904,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues -if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_lists_t), int(has_nesting_t)); } +if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d \n", + int(has_lists_t), int(has_nesting_t), int(should_process_nulls)); } while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; @@ -898,14 +918,12 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_list processed_count += def_decoder.decode_next(t); __syncthreads(); - if constexpr (has_nesting_t) { - if constexpr (has_lists_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( - processed_count, s, sb, def, rep, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, def, t); - } + if constexpr (has_lists_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, def, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -917,16 +935,16 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_list else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - if constexpr (has_nesting_t) { - if constexpr (has_lists_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedLists( - processed_count, s, sb, nullptr, rep, t); - } else { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, nullptr, t); - } + if constexpr (has_lists_t) { + // no nulls, but if we have a list we still need the definition levels + def_decoder.decode_next(t); + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, nullptr, t); From 4b5f91a9f6b7c70400a9ed08b05c9f82cc3be971 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 27 Aug 2024 14:59:14 -0400 Subject: [PATCH 03/38] Tests working --- .../cudf/table/experimental/row_operators.cuh | 14 +- cpp/src/io/parquet/decode_fixed.cu | 462 +++++++++++------- cpp/src/io/parquet/page_data.cuh | 18 + cpp/src/io/parquet/page_decode.cuh | 101 ++-- cpp/src/io/parquet/page_hdr.cu | 3 +- cpp/src/io/parquet/reader_impl.cpp | 8 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 12 + 7 files changed, 411 insertions(+), 207 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh 
b/cpp/include/cudf/table/experimental/row_operators.cuh index e9b81a525fc..75de3a75197 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1429,18 +1429,30 @@ class device_row_comparator { __device__ bool operator()(size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { + static constexpr bool enable_print = false; if (check_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; bool const rhs_is_null{rhs.is_null(rhs_element_index)}; if (lhs_is_null and rhs_is_null) { return nulls_are_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { + if constexpr (enable_print) { + printf("NULLS UNEQUAL AT %d, %d; values: %d %d\n", + lhs_element_index, rhs_element_index, int(lhs_is_null), int(rhs_is_null)); + } return false; } } - return comparator(lhs.element(lhs_element_index), + bool result = comparator(lhs.element(lhs_element_index), rhs.element(rhs_element_index)); + if constexpr (enable_print && cuda::std::is_integral_v) { + if(!result) { + printf("VALUES UNEQUAL: AT %d, %d, VALUES %d, %d\n", lhs_element_index, rhs_element_index, + (int)lhs.element(lhs_element_index), (int)rhs.element(rhs_element_index)); + } + } + return result; } template +template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -34,26 +34,66 @@ __device__ inline void gpuDecodeFixedWidthValues( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; int const dtype = s->col.physical_type; + int const leaf_level_index = s->col.max_nesting_depth - 1; + uint32_t dtype_len = s->dtype_len; + auto const data_out = nesting_info_base[leaf_level_index].data_out; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " + "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", + start, end, s->first_row, leaf_level_index, dtype_len, data_out, s->dict_base, s->dict_bits, s->dict_val, + s->dict_size, s->data_start, skipped_leaf_values, s->input_row_count); + } + } + // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; +//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + auto offset = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = offset; + if constexpr (!has_lists_t) { + dst_pos -= s->first_row; + } + + int dict_idx = rolling_index(src_pos + skipped_leaf_values); + int dict_pos = sb->dict_idx[dict_idx]; + if constexpr (enable_print) { + if(t == 0) { + printf("DECODE OFFSETS: pos %d, src_pos %d, offset %d, dst_pos %d, target_pos %d, dict_idx %d, dict_pos %d\n", + pos, src_pos, offset, dst_pos, target_pos, dict_idx, dict_pos); + } + } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
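// [Editor's sketch, not part of this patch] A host-side model of the mapping that
// sb->nz_idx supplies to this function: the k-th decodable (non-null) value is sent to a
// gap-preserving destination position, so the scatter above can leave holes for nulls
// (the first_row adjustment for non-list columns is omitted here). Names are
// hypothetical; only standard C++ is assumed.
#include <vector>

std::vector<int> build_nz_idx(std::vector<bool> const& valid)  // valid[i]: destination slot i is non-null
{
  std::vector<int> nz_idx;
  for (int out_pos = 0; out_pos < static_cast<int>(valid.size()); ++out_pos) {
    if (valid[out_pos]) { nz_idx.push_back(out_pos); }  // k-th non-null value -> slot out_pos
  }
  return nz_idx;  // e.g. valid = {1,0,1,1} gives nz_idx = {0,2,3}
}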
if (src_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + if constexpr (has_lists_t) { + src_pos += skipped_leaf_values; + } + + void* dst = data_out + static_cast(dst_pos) * dtype_len; + if constexpr (enable_print) { + if(dst_pos == 0) { + printf("WRITTEN TO dst_pos ZERO: t %d, data_out %p, dst %p, src_pos %d, dict_idx %d, dict_pos %d, dict_base %p\n", + t, data_out, dst, src_pos, dict_idx, dict_pos, s->dict_base); + } + } + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; @@ -92,15 +132,15 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -template +template struct decode_fixed_width_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthValues(s, sb, start, end, t); + gpuDecodeFixedWidthValues(s, sb, start, end, t); } }; -template +template __device__ inline void gpuDecodeFixedWidthSplitValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -112,6 +152,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -119,10 +160,13 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + if constexpr (!has_lists_t) { + dst_pos -= s->first_row; + } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. @@ -130,6 +174,14 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
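// [Editor's sketch, not part of this patch] The adjustment described in the comment
// above, as a one-line host model: list columns have no 1:1 row-to-value mapping, so the
// true read position is the logical position plus the leaf values skipped during list
// preprocessing, while flat columns skip nothing. skipped_leaf_values is a hypothetical
// input here.
inline int leaf_read_position(int logical_pos, bool is_list_column, int skipped_leaf_values)
{
  return is_list_column ? logical_pos + skipped_leaf_values : logical_pos;
}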
+ if constexpr (has_lists_t) { + src_pos += skipped_leaf_values; + } + uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; uint8_t* dst = @@ -186,11 +238,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( } } -template +template struct decode_fixed_width_split_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); } }; @@ -201,11 +253,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - // how many (input) values we've processed in the page so far, prior to this loop iteration + // how many (input) values we've processed in the page so far int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; //row index WITHIN THE PAGE + int const first_row = s->first_row; int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); @@ -217,101 +269,69 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // get definition level. only need to process for nullable columns - int def_level; + // definition level. only need to process for nullable columns + int d = 0; if constexpr (nullable) { if (def) { - def_level = t < batch_size + d = t < batch_size ? static_cast(def[rolling_index(value_count + t)]) : -1; } else { - def_level = t < batch_size ? 1 : -1; + d = t < batch_size ? 
1 : -1; } - } else { - def_level = 0; } - //Determine value count & row index - int const thread_value_count = t; //# of output values from the view of this thread - int const block_value_count = batch_size; - int const row_index = t + value_count; //thread_row_index in old - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - - //per-warp variables used below for writing validity - int const in_write_row_bounds = (row_index >= first_row) && (row_index < last_row); - - //bit mask of all threads that passed true - int const in_write_row_bounds_mask = ballot(in_write_row_bounds); - - // index of first set bit (in the warp to store) - int write_start = __ffs(in_write_row_bounds_mask) - 1; - - // remaining code is trivial for non-nullable, non-list columns: no need to iterate over depth - if constexpr (!nullable) { + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - // if this is valid and we're at the leaf, output dst_pos - int const is_valid = in_row_bounds; - if (is_valid) { - auto& ni = s->nesting_info[max_depth]; - int const thread_valid_count = thread_value_count; + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - // for non-list types, the value count is always the same across - int const dst_pos = value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; - // update valid_count - if (t == 0) { - int const block_valid_count = block_value_count; - s->nesting_info[max_depth].valid_count += block_valid_count; + int is_valid; + if constexpr (nullable) { + is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; } - __syncthreads(); // publish modification of nesting_info value_count - } else { - - // column is a nullable non-list: iterate by depth - for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - - auto& ni = s->nesting_info[d_idx]; - - // everything up to the max_def_level is a non-null value - int is_valid = ((def_level >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - - // thread and block validity count - // queries is_valid of all threads, stores prior total and total total + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - int thread_valid_count, block_valid_count; - block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); - // validity is processed per-warp (lane 0 writes), because writes are atomic + // validity is processed per-warp // - // nested schemas always read and write to the same bounds - // (that is, read and write positions are already pre-bounded by first_row/num_rows). 
- // since we are about to write the validity vector + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. int warp_null_count = 0; - if ((write_start >= 0) && (ni.valid_map != nullptr)) { + if (write_start >= 0 && ni.valid_map != nullptr) { + int const valid_map_offset = ni.valid_map_offset; uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - // absolute input value index - int const vindex = value_count + thread_value_count; - - // absolute bit offset into the output validity map - // subtract by first_row: we may skip first N rows, - // but still need to write bits at beginning of output vector - int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; - - // last bit in the warp to store - int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); - int const bit_count = write_end - write_start; //in old is warp_valid_mask_bit_count - - uint32_t const warp_output_valid_mask = warp_validity_mask >> write_start; - - store_validity(bit_offset, ni.valid_map, warp_output_valid_mask, bit_count); - - warp_null_count = bit_count - __popc(warp_output_valid_mask); + int const vindex = + (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = cudf::detail::warp_size - + __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } } @@ -322,20 +342,25 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); if (t == 0) { ni.null_count += block_null_count; } + } + // trivial for non-nullable columns + else { + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; + } - // if this is valid and we're at the leaf, output dst_pos - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - __syncthreads(); // handle modification of ni.valid_count from below - int const dst_pos = value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - __syncthreads(); // handle modification of ni.valid_count from below + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.value_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { 
ni.valid_count += block_valid_count; } - } //END OF DEPTH LOOP + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } } value_count += block_value_count; @@ -482,7 +507,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( } template -static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( +static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { @@ -493,16 +518,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; - int printf_num_threads = 34; + static constexpr bool enable_print = false; + int const printf_num_threads = 32; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; - if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + if constexpr (enable_print) { + if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; - if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if constexpr (enable_print) { + if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -510,6 +540,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( __syncthreads(); while (value_count < target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("VALUE COUNT: %d\n", value_count); } + } bool const within_batch = value_count + t < target_value_count; // get definition level, use repitition level to get start/end depth @@ -517,13 +550,23 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - int const rep_level = rep[index]; - def_level = static_cast(def[rolling_index(value_count + t)]); + auto const rep_level = static_cast(rep[index]); + def_level = static_cast(def[index]); //computed by generate_depth_remappings() + if constexpr (enable_print) { + if((rep_level < 0) || (rep_level > max_depth)) { + printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); + } + } start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; - if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + if constexpr (enable_print) { + if((def_level < 0) || (def_level > (max_depth + 1))) { + printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); + } + } + if (enable_print && (t < printf_num_threads)) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ t, def_level, rep_level, start_depth, end_depth); } } @@ -537,7 +580,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be 
reused - if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } + if constexpr (enable_print) { + if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } + } int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; @@ -549,21 +594,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ - row_index, in_row_bounds, in_nesting_bounds, last_row); } - if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", - t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } + if constexpr (enable_print) { + if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ + row_index, in_row_bounds, in_nesting_bounds, last_row); } + if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } + } // queries is_valid from all threads, stores prior total and total total int thread_value_count = 0, block_value_count = 0; block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); - if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", - t, thread_value_count, in_nesting_bounds); } - - //bit mask of all threads that passed true //TODO DELETE ME - //uint32_t const in_write_row_bounds_mask = ballot(in_row_bounds); + if constexpr (enable_print) { + if (t == 0) { printf("block_value_count %d\n", block_value_count); } + if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + t, thread_value_count, in_nesting_bounds); } + } // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -578,9 +624,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( is_valid = in_nesting_bounds; } - if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", - int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } + if constexpr (enable_print) { + if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", + int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } + if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", + t, def_level, in_nesting_bounds, is_valid); } + } // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -612,8 +661,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); - if (t == 0) { printf("block_valid_mask %d\n", int(block_valid_mask)); } - if (t < printf_num_threads) { printf("t %d, 
thread_valid_count %d\n", t, thread_valid_count); } + if constexpr (enable_print) { + if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { + printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " + "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", + def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, + row_index_lower_bound, last_row, input_row_count); } + + if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } + if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + } // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting @@ -630,10 +687,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", - t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + if constexpr (enable_print) { + if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } + if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } + if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + } // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so @@ -645,8 +704,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; - if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", - t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } + + if constexpr (enable_print) { + if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } + if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } + } } } @@ -668,24 +731,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const bit_count = __popc(warp_count_mask); if(bit_count > 0) { - // absolute input value index - int const vindex = value_count + thread_value_count; - // absolute bit offset into the output validity map //is cumulative sum of bit_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value - // valid_map_offset was already set during list pre-processing for appropriate start index - int const bit_offset = ni.valid_map_offset + vindex; - + int const bit_offset = ni.valid_map_offset + thread_value_count; auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); auto const bit_range_mask = (1 << bit_count) 
- 1; //mainly needed for warp_null_count auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - printf("t %d, thread_value_count %d, vindex %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", - t, thread_value_count, vindex, bit_offset, bit_count, warp_validity_mask); - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); warp_null_count = bit_count - __popc(warp_validity_mask); + + if constexpr (enable_print) { + printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", + t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, bit_count, warp_validity_mask); + printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); + } } } } @@ -696,6 +757,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // compute it directly at the end of the kernel. size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if constexpr (enable_print) { + if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", + d_idx, ni.null_count, block_null_count); } + } if (t == 0) { ni.null_count += block_null_count; } } @@ -707,9 +772,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const src_pos = ni.valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if constexpr (enable_print) { + if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } + if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index out of bounds!\n"); + } + + if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", + output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} + printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); + } + + //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } __syncthreads(); // handle modification of ni.value_count from below @@ -719,6 +796,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; + ni.valid_map_offset += block_value_count; } // propagate value counts for the next depth level @@ -727,13 +805,17 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( in_nesting_bounds = next_in_nesting_bounds; } //END OF DEPTH LOOP - if (t == 0) { printf("END DEPTH LOOP\n"); } + if constexpr (enable_print) { + if (t == 0) { printf("END DEPTH LOOP\n"); } + } int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } - if (t == 0) { printf("END LOOP\n"); } + if constexpr (enable_print) { + if (t == 0) { printf("END LOOP\n"); } + } if (t == 0) { // update valid value count for decoding and total # of values we've processed @@ -795,7 +877,7 @@ template + 
template typename DecodeValuesFunc> CUDF_KERNEL void __launch_bounds__(decode_block_size_t) gpuDecodePageDataGeneric(PageInfo* pages, @@ -837,7 +919,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + DecodeValuesFunc decode_values; bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); @@ -885,18 +967,27 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values); } + static constexpr bool enable_print = false; + rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { + //auto const skipped_leaf_values = s->page.skipped_leaf_values; + //int const dict_offset = skipped_leaf_values * sizeof(uint32_t); dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + if constexpr (enable_print) { + if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", + s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } + } + dict_stream.decode_next(t, s->page.skipped_leaf_values); } __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. + // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the + // - valid_count: number of non-null values we have decoded so far. In each iteration of the // loop below, we look at the number of valid items (which could be all for non-nullable), // and valid_count is that running count. int processed_count = 0; @@ -904,23 +995,55 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. 
decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues -if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d \n", - int(has_lists_t), int(has_nesting_t), int(should_process_nulls)); } + if constexpr (enable_print) { + if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", + page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } + } + + + auto print_nestings = [&](bool is_post){ + if constexpr (enable_print) { + auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { + printf("page_idx %d, max_def_level %d, start_depth %d, end_depth %d, page_start_value %d, null_count %d, " + "valid_map_offset %d, valid_count %d, value_count %d\n", + page_idx, ni.max_def_level, ni.start_depth, ni.end_depth, ni.page_start_value, ni.null_count, + ni.valid_map_offset, ni.valid_count, ni.value_count); + }; + + if(t == 0) { + printf("POST %d NESTING 0: ", int(is_post)); + print_nesting_level(s->nesting_info[0]); + printf("POST %d NESTING 1: ", int(is_post)); + print_nesting_level(s->nesting_info[1]); + printf("POST %d NESTING 2: ", int(is_post)); + print_nesting_level(s->nesting_info[2]); + } + } + }; + + print_nestings(false); while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; + if constexpr (has_lists_t){ + rep_decoder.decode_next(t); + } + // only need to process definition levels if this is a nullable column if (should_process_nulls) { - if constexpr (has_lists_t){ - rep_decoder.decode_next(t); - } processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_lists_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + int value_count = s->input_value_count; + next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); + if constexpr (enable_print) { + if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " + "processed_count %d, valid_count %d, next_valid_count %d\n", + s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } + } } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( processed_count, s, sb, def, t); @@ -933,21 +1056,25 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d // this function call entirely since all it will ever generate is a mapping of (i -> i) for // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
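// [Editor's sketch, not part of this patch] The simplification suggested in the comment
// above: for a non-nullable, non-list column every value is valid and value i lands in
// output slot i, so the validity pass would only produce this identity mapping
// (rolling-buffer wrap-around is ignored for brevity; the helper is hypothetical).
__device__ inline void fill_identity_nz_idx_sketch(int* nz_idx, int count, int t, int block_size)
{
  for (int i = t; i < count; i += block_size) {
    nz_idx[i] = i;  // no nulls, so there are no gaps to skip over
  }
}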
else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - if constexpr (has_lists_t) { // no nulls, but if we have a list we still need the definition levels - def_decoder.decode_next(t); + processed_count += def_decoder.decode_next(t); + __syncthreads(); + next_valid_count = - gpuUpdateValidityAndRowIndicesNestedLists( + gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); - } else if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, nullptr, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); + + if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, nullptr, t); + } } } __syncthreads(); @@ -966,8 +1093,15 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d __syncthreads(); valid_count = next_valid_count; + + if constexpr (enable_print) { + if(t == 0) { printf("LOOP: processed_count %d, #page values %d, error %d\n", + processed_count, s->page.num_input_values, s->error); } + } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + print_nestings(true); } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh index f182747650e..e82f927e34f 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -89,6 +89,14 @@ inline __device__ void gpuStoreOutput(uint32_t* dst, bytebuf = 0; } *dst = bytebuf; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (threadIdx.x == 0) { + printf("STORE VALUE %u at %p, src8 %p, dict_pos %u, dict_size %u, ofs %u\n", + bytebuf, dst, src8, dict_pos, dict_size, ofs); + } + } } /** @@ -328,6 +336,7 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; +auto dict_lookup_idx = rolling_index(src_pos); if (s->dict_base) { // Dictionary dict_pos = @@ -339,6 +348,15 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos dict = s->data_start; } dict_pos *= (uint32_t)s->dtype_len_in; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (threadIdx.x == 0) { + printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", + dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); + } + } + gpuStoreOutput(dst, dict, dict_pos, dict_size); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b1f8e6dd5fe..cb682112195 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -588,8 +588,8 @@ inline __device__ void store_validity(int valid_map_offset, if (relevant_mask == ~0) { valid_map[word_offset] = valid_mask; } else { - atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); //clears old bits + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << 
bit_offset); //sets valid mask } } // we're going to spill over into the next word. @@ -719,9 +719,16 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); + uint32_t const warp_row_count_mask = ballot(is_new_row); //how many threads are starting a new row + //t is zero through 31. the shifted bit is the 1st through the 32nd bit. then we -1: mask + //the mask we and with is querying PRIOR threads + uint32_t const prior_thread_mask = ((1 << t) - 1); //query "for all threads before me" + uint32_t const prior_new_rows_bits = warp_row_count_mask & prior_thread_mask; + int32_t const num_prior_new_rows = __popc(prior_new_rows_bits); + int32_t const thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); + input_row_count + ((num_prior_new_rows + is_new_row) - 1); + input_row_count += __popc(warp_row_count_mask); // is this thread within read row bounds? int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && @@ -729,30 +736,34 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value ? 1 : 0; + // if we are within the range of nesting levels we should be adding value indices for +//if list: is from/in current rep level to/in the rep level AT the depth with the def value + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; + // compute warp and thread value counts - uint32_t const warp_count_mask = - ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); + uint32_t const warp_count_mask = ballot(in_nesting_bounds); warp_value_count = __popc(warp_count_mask); - // Note : ((1 << t) - 1) implies "for all threads before me" - thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); + // thread_value_count : # of output values from the view of this thread + // is all threads before me that start from rep level zero (new row) + thread_value_count = __popc(warp_count_mask & prior_thread_mask); // walk from 0 to max_depth - uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; - // everything up to the max_def_level is a non-null value +//if is NOT list, then means is-not-null, OR is-null in a CHILD node +//if IS list, also: is from/in current rep level to/in the rep level AT the depth with the def value uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; // compute warp and thread valid counts + // bit is set for each thread in the warp that is_valid +//OR of all is_valid's shifted by thread_value_count uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output +//If no lists: every entry is a new row, which may be null !has_repetition ? 
ballot(is_valid) : @@ -763,8 +774,10 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); +//For this value, we save an offset at every depth (in the loop) + //# bits prior to this thread that are valid (set) thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); + warp_valid_count = __popc(warp_valid_mask); //#set bits of all threads in warp // if this is the value column emit an index for value decoding if (is_valid && s_idx == max_depth - 1) { @@ -778,10 +791,15 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level + uint32_t next_thread_value_count = 0, next_warp_value_count = 0; + int next_in_nesting_bounds = 0; if (s_idx < max_depth - 1) { - uint32_t const next_warp_count_mask = - ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); - next_warp_value_count = __popc(next_warp_count_mask); + //mask is different between depths + next_in_nesting_bounds = + (s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; + uint32_t const next_warp_count_mask = ballot(next_in_nesting_bounds); + + next_warp_value_count = __popc(next_warp_count_mask); //same for all threads, but not all depths next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); // if we're -not- at a leaf column and we're within nesting/row bounds @@ -792,34 +810,36 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + next_thread_value_count + nesting_info_base[s_idx + 1].page_start_value; +//STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(nesting_info->data_out))[idx] = ofs; } } - // nested schemas always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the - // first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector here + // lists always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows) how? we have pre-processed them. + // flat schemas will start reading at the first value, even if that is before first_row, + // because we cannot trivially jump to the correct position to start reading. + // why not? because we don't know how many nulls were before it (haven't preprocessed them) + // since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; + //is write_start in new int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; - - // # of bits to of the validity mask to write out - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + !has_repetition ? 
__ffs(ballot(in_write_row_bounds)) - 1 : 0; //index of lowest bit set to // increment count of valid values, count of total values, and update validity mask if (!t) { + // # of bits to of the validity mask to write out //becomes bit_count + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info->valid_map_offset, - nesting_info->valid_map, - warp_output_valid_mask, - warp_valid_mask_bit_count); + store_validity(nesting_info->valid_map_offset, nesting_info->valid_map, + warp_output_valid_mask, warp_valid_mask_bit_count); nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } @@ -830,7 +850,8 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // propagate value counts for the next level warp_value_count = next_warp_value_count; thread_value_count = next_thread_value_count; - } + in_nesting_bounds = next_in_nesting_bounds; + } //END OF DEPTH LOOP input_value_count += min(32, (target_input_value_count - input_value_count)); __syncwarp(); @@ -1096,6 +1117,12 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row ? max_page_rows : max_row - (page_start_row + s->first_row); + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + printf("NUM_ROWS: col.start_row %lu, page.chunk_row %d, page_start_row %lu, s->first_row %d, s->page.num_rows %d, max_row %lu, min_row %lu, num_rows %lu, s->num_rows %d\n", + s->col.start_row, s->page.chunk_row, page_start_row, s->first_row, s->page.num_rows, max_row, min_row, num_rows, s->num_rows); + } } } @@ -1256,13 +1283,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (s->col.column_data_base != nullptr) { - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); if (s->col.column_string_base != nullptr) { nesting_info->string_out = static_cast(s->col.column_string_base[idx]); } nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - if (nesting_info->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. @@ -1277,8 +1302,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } nesting_info->valid_map = s->col.valid_map_base[idx]; if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + nesting_info->valid_map += output_offset >> 5; //is pointer to warp start + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); //is index within warp } } } @@ -1357,7 +1382,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_pos = 0; s->src_pos = 0; - // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // for non-lists, we can't know how many leaf values to skip unless we do a full // preprocess of the definition levels (since nulls will have no actual decodable value, there // is no direct correlation between # of rows and # of decodable values). 
so we will start // processing at the beginning of the value stream and disregard any indices that start @@ -1371,7 +1396,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->row_index_lower_bound = -1; } - // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // for lists, we have run a preprocess that lets us skip directly to the values // we need to start decoding at else { // input_row_count translates to "how many rows we have processed so far", so since we are @@ -1379,7 +1404,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->input_row_count = s->first_row; // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. That is to say, + // In the case of lists, rows can span page boundaries. That is to say, // we can encounter the first value for row X on page M, but the last value for page M // might not be the last value for row X. page M+1 (or further) may contain the last value. // diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index ac39e2ac291..53a55a43300 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -183,7 +183,8 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, return decode_kernel_mask::STRING; } - if (is_list(chunk)) { + if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + //if (is_list(chunk)) { if (page.encoding == Encoding::PLAIN) { return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index cc98e263664..b72359f0d73 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -71,8 +71,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // figure out which kernels to run auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); -printf("DECODE DATA PAGE, mask %d\n", kernel_mask); - // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than @@ -223,6 +221,11 @@ printf("DECODE DATA PAGE, mask %d\n", kernel_mask); int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + printf("PAGE DATA DECODE MASK: %d\n", kernel_mask); + } + // launch string decoder int s_idx = 0; if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { @@ -333,7 +336,6 @@ printf("DECODE DATA PAGE, mask %d\n", kernel_mask); // launch fixed width type decoder for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { -printf("LIST PAGE\n"); DecodePageDataFixed(subpass.pages, pass.chunks, num_rows, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..9405f658429 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -129,7 +129,12 @@ void generate_depth_remappings(std::map, std::ve // depth. 
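// [Editor's sketch, not part of this patch] How the remappings computed by this function
// are consumed by the decode kernels: for each decoded (rep, def) pair, rep_depth_remap
// gives the shallowest nesting depth the value re-opens (its start_depth) and
// def_depth_remap the deepest depth it reaches (its end_depth); the value then occupies
// one slot at every depth d with start_depth <= d <= end_depth. Host-side model with
// hypothetical inputs:
#include <utility>
#include <vector>

void count_values_per_depth(std::vector<std::pair<int, int>> const& start_end_depths,
                            std::vector<int>& value_count_per_depth)
{
  for (auto const& [start_depth, end_depth] : start_end_depths) {
    for (int d = start_depth; d <= end_depth; ++d) {
      value_count_per_depth[d] += 1;  // this value contributes one slot at depth d
    }
  }
}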
// + static constexpr bool enable_print = false; + // compute "X" from above + if constexpr (enable_print) { + printf("REMAPPING: max def %d, max rep %d\n", schema.max_definition_level, schema.max_repetition_level); + } for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { auto find_shallowest = [&](int r) { int shallowest = -1; @@ -148,6 +153,9 @@ void generate_depth_remappings(std::map, std::ve if (!cur_schema.is_stub()) { cur_depth--; } schema_idx = cur_schema.parent_idx; } + if constexpr (enable_print) { + printf("REMAPPING: s_idx / r %d, shallowest %d\n", r, shallowest); + } return shallowest; }; rep_depth_remap[s_idx] = find_shallowest(s_idx); @@ -186,6 +194,10 @@ void generate_depth_remappings(std::map, std::ve prev_schema = cur_schema; schema_idx = cur_schema.parent_idx; } + + if constexpr (enable_print) { + printf("REMAPPING: s_idx %d, r1 %d, end_depth %d\n", s_idx, r1, depth); + } return depth; }; def_depth_remap[s_idx] = find_deepest(s_idx); From ead17b8bc2f12f77e122ca7de7fbce52ef77c945 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 28 Aug 2024 13:56:28 -0400 Subject: [PATCH 04/38] Revert page_decode changes --- cpp/src/io/parquet/decode_fixed.cu | 47 +++++++++++-- cpp/src/io/parquet/page_data.cuh | 2 +- cpp/src/io/parquet/page_decode.cuh | 103 +++++++++++------------------ cpp/src/io/parquet/page_hdr.cu | 1 - cpp/src/io/parquet/parquet_gpu.hpp | 22 ------ 5 files changed, 80 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 312aa31c67b..8ba251aec0b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -261,12 +261,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (t == 0) { printf("NESTED: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } + } + int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; __syncthreads(); while (value_count < capped_target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("NESTED VALUE COUNT: %d\n", value_count); } + } int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level. 
only need to process for nullable columns @@ -290,6 +299,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + if constexpr (enable_print) { + if(t == 0) { printf("NESTED ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d\n", + row_index, row_index_lower_bound, last_row, in_row_bounds); } + } + // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; @@ -356,6 +370,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const dst_pos = (value_count + thread_value_count) - 1; int const src_pos = (ni.valid_count + thread_valid_count) - 1; sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if constexpr (enable_print) { + if(t == 0) {printf("NESTED STORE: first_row %d, row_index %d dst_pos %d, src_pos %d\n", + first_row, row_index, dst_pos, src_pos);} + } } __syncthreads(); // handle modification of ni.value_count from below @@ -395,12 +413,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (t == 0) { printf("FLAT: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } + } + int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; __syncthreads(); while (value_count < capped_target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("FLAT VALUE COUNT: %d\n", value_count); } + } + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); int const thread_value_count = t; @@ -519,7 +547,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int value_count = s->input_value_count; static constexpr bool enable_print = false; - int const printf_num_threads = 32; + int const printf_num_threads = 0; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; @@ -531,7 +559,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const first_row = s->first_row; int const last_row = first_row + s->num_rows; if constexpr (enable_print) { - if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if (t == 0) { printf("LIST s->input_value_count %d, first_row %d, last_row %d, target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count); } } int const row_index_lower_bound = s->row_index_lower_bound; @@ -541,7 +570,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( while (value_count < target_value_count) { if constexpr (enable_print) { - if(t == 0) { printf("VALUE COUNT: %d\n", value_count); } + if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } bool const within_batch = value_count + t < target_value_count; @@ -565,9 +594,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( if((def_level < 0) || (def_level > (max_depth + 1))) { printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); } + if (t < printf_num_threads) 
{ printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth); } } - if (enable_print && (t < printf_num_threads)) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth); } } //Determine value count & row index @@ -595,8 +624,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; if constexpr (enable_print) { - if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ - row_index, in_row_bounds, in_nesting_bounds, last_row); } + if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", + row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } } @@ -1040,6 +1069,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); if constexpr (enable_print) { + if(t == 0) { printf("LISTS NEXT: next_valid_count %d\n", next_valid_count); } if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " "processed_count %d, valid_count %d, next_valid_count %d\n", s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } @@ -1047,6 +1077,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( processed_count, s, sb, def, t); + if constexpr (enable_print) { + if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh index e82f927e34f..1e13302c467 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -336,7 +336,6 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; -auto dict_lookup_idx = rolling_index(src_pos); if (s->dict_base) { // Dictionary dict_pos = @@ -352,6 +351,7 @@ auto dict_lookup_idx = rolling_index(src_pos); static constexpr bool enable_print = false; if constexpr (enable_print) { if (threadIdx.x == 0) { + auto dict_lookup_idx = rolling_index(src_pos); printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cb682112195..7e4fb0271d5 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -588,8 +588,8 @@ inline __device__ void store_validity(int valid_map_offset, if (relevant_mask == ~0) { valid_map[word_offset] = valid_mask; } else { - atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); //clears old bits - atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << 
bit_offset); //sets valid mask + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); } } // we're going to spill over into the next word. @@ -719,16 +719,9 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); //how many threads are starting a new row - //t is zero through 31. the shifted bit is the 1st through the 32nd bit. then we -1: mask - //the mask we and with is querying PRIOR threads - uint32_t const prior_thread_mask = ((1 << t) - 1); //query "for all threads before me" - uint32_t const prior_new_rows_bits = warp_row_count_mask & prior_thread_mask; - int32_t const num_prior_new_rows = __popc(prior_new_rows_bits); - + uint32_t const warp_row_count_mask = ballot(is_new_row); int32_t const thread_row_index = - input_row_count + ((num_prior_new_rows + is_new_row) - 1); - + input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); input_row_count += __popc(warp_row_count_mask); // is this thread within read row bounds? int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && @@ -736,34 +729,30 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value ? 1 : 0; - // if we are within the range of nesting levels we should be adding value indices for -//if list: is from/in current rep level to/in the rep level AT the depth with the def value - int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - // compute warp and thread value counts - uint32_t const warp_count_mask = ballot(in_nesting_bounds); + uint32_t const warp_count_mask = + ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); warp_value_count = __popc(warp_count_mask); - // thread_value_count : # of output values from the view of this thread - // is all threads before me that start from rep level zero (new row) - thread_value_count = __popc(warp_count_mask & prior_thread_mask); + // Note : ((1 << t) - 1) implies "for all threads before me" + thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); // walk from 0 to max_depth + uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; + // everything up to the max_def_level is a non-null value -//if is NOT list, then means is-not-null, OR is-null in a CHILD node -//if IS list, also: is from/in current rep level to/in the rep level AT the depth with the def value uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; // compute warp and thread valid counts - // bit is set for each thread in the warp that is_valid -//OR of all is_valid's shifted by thread_value_count uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output -//If no lists: every entry is a new row, which may be null !has_repetition ? 
ballot(is_valid) : @@ -774,10 +763,8 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); -//For this value, we save an offset at every depth (in the loop) - //# bits prior to this thread that are valid (set) thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); //#set bits of all threads in warp + warp_valid_count = __popc(warp_valid_mask); // if this is the value column emit an index for value decoding if (is_valid && s_idx == max_depth - 1) { @@ -791,15 +778,10 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level - uint32_t next_thread_value_count = 0, next_warp_value_count = 0; - int next_in_nesting_bounds = 0; if (s_idx < max_depth - 1) { - //mask is different between depths - next_in_nesting_bounds = - (s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; - uint32_t const next_warp_count_mask = ballot(next_in_nesting_bounds); - - next_warp_value_count = __popc(next_warp_count_mask); //same for all threads, but not all depths + uint32_t const next_warp_count_mask = + ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); + next_warp_value_count = __popc(next_warp_count_mask); next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); // if we're -not- at a leaf column and we're within nesting/row bounds @@ -810,36 +792,34 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + next_thread_value_count + nesting_info_base[s_idx + 1].page_start_value; -//STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(nesting_info->data_out))[idx] = ofs; } } - // lists always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows) how? we have pre-processed them. - // flat schemas will start reading at the first value, even if that is before first_row, - // because we cannot trivially jump to the correct position to start reading. - // why not? because we don't know how many nulls were before it (haven't preprocessed them) - // since we are about to write the validity vector here + // nested schemas always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the + // first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; - //is write_start in new int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; //index of lowest bit set to + !has_repetition ? 
__ffs(ballot(in_write_row_bounds)) - 1 : 0; + + // # of bits to of the validity mask to write out + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; // increment count of valid values, count of total values, and update validity mask if (!t) { - // # of bits to of the validity mask to write out //becomes bit_count - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; - if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info->valid_map_offset, nesting_info->valid_map, - warp_output_valid_mask, warp_valid_mask_bit_count); + store_validity(nesting_info->valid_map_offset, + nesting_info->valid_map, + warp_output_valid_mask, + warp_valid_mask_bit_count); nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } @@ -850,8 +830,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // propagate value counts for the next level warp_value_count = next_warp_value_count; thread_value_count = next_thread_value_count; - in_nesting_bounds = next_in_nesting_bounds; - } //END OF DEPTH LOOP + } input_value_count += min(32, (target_input_value_count - input_value_count)); __syncwarp(); @@ -1117,12 +1096,6 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row ? max_page_rows : max_row - (page_start_row + s->first_row); - - static constexpr bool enable_print = false; - if constexpr (enable_print) { - printf("NUM_ROWS: col.start_row %lu, page.chunk_row %d, page_start_row %lu, s->first_row %d, s->page.num_rows %d, max_row %lu, min_row %lu, num_rows %lu, s->num_rows %d\n", - s->col.start_row, s->page.chunk_row, page_start_row, s->first_row, s->page.num_rows, max_row, min_row, num_rows, s->num_rows); - } } } @@ -1283,11 +1256,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (s->col.column_data_base != nullptr) { + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); if (s->col.column_string_base != nullptr) { nesting_info->string_out = static_cast(s->col.column_string_base[idx]); } nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (nesting_info->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. 
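A quick sketch of the bitmap math the next hunk touches (illustrative only; the helper
name below is hypothetical, not part of this patch): the validity bitmap is stored as
32-bit words, so an element offset is split into a word index (offset >> 5) and a bit
position within that word (offset & 0x1f); partial words are then merged with
atomicAnd/atomicOr, as store_validity does above.

  #include <cstdint>
  // Hypothetical host-side helper showing the same word/bit decomposition.
  inline void set_valid_bit(uint32_t* valid_map, int32_t offset)
  {
    uint32_t* word      = valid_map + (offset >> 5);   // which 32-bit word
    uint32_t const mask = 1u << (offset & 0x1f);       // which bit inside that word
    *word |= mask;  // a device version would use atomicOr(word, mask)
  }
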
@@ -1302,8 +1277,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } nesting_info->valid_map = s->col.valid_map_base[idx]; if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; //is pointer to warp start - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); //is index within warp + nesting_info->valid_map += output_offset >> 5; + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); } } } @@ -1382,7 +1357,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_pos = 0; s->src_pos = 0; - // for non-lists, we can't know how many leaf values to skip unless we do a full + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full // preprocess of the definition levels (since nulls will have no actual decodable value, there // is no direct correlation between # of rows and # of decodable values). so we will start // processing at the beginning of the value stream and disregard any indices that start @@ -1396,7 +1371,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->row_index_lower_bound = -1; } - // for lists, we have run a preprocess that lets us skip directly to the values + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values // we need to start decoding at else { // input_row_count translates to "how many rows we have processed so far", so since we are @@ -1404,7 +1379,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->input_row_count = s->first_row; // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of lists, rows can span page boundaries. That is to say, + // In the case of nested schemas, rows can span page boundaries. That is to say, // we can encounter the first value for row X on page M, but the last value for page M // might not be the last value for row X. page M+1 (or further) may contain the last value. // @@ -1439,4 +1414,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::detail +} // namespace cudf::io::parquet::detail \ No newline at end of file diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 53a55a43300..3fad8e344ea 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -184,7 +184,6 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { - //if (is_list(chunk)) { if (page.encoding == Encoding::PLAIN) { return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d666f129af8..b8093cb3195 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -818,28 +818,6 @@ void DecodeStringPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); -/** - * @brief Launches kernel for reading the list column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. 
- * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeListPageData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages * From 0dccec54a6a3f40d7096d565d23755d48be68cad Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 5 Sep 2024 18:20:30 -0400 Subject: [PATCH 05/38] Add debugging --- cpp/src/io/parquet/decode_fixed.cu | 192 ++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 685249c607e..33f11aef9b2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -40,6 +40,8 @@ __device__ inline void gpuDecodeFixedWidthValues( uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; static constexpr bool enable_print = false; + static constexpr bool enable_print_range_error = false; + if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", @@ -57,13 +59,17 @@ __device__ inline void gpuDecodeFixedWidthValues( int src_pos = pos + t; // the position in the output column/buffer -//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) +//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) auto offset = sb->nz_idx[rolling_index(src_pos)]; int dst_pos = offset; if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + if constexpr (has_lists_t && enable_print_range_error) { + if((dst_pos < 0) && (src_pos < target_pos)) { printf("WHOA: decode dst_pos %d out of bounds, src_pos %d, start %d\n", dst_pos, src_pos, start); } + } + int dict_idx = rolling_index(src_pos + skipped_leaf_values); int dict_pos = sb->dict_idx[dict_idx]; if constexpr (enable_print) { @@ -126,6 +132,14 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } + + if (dtype == INT32) { + int value_stored = *static_cast(dst); + int overall_index = blockIdx.x * 20000 * 4 + src_pos; + if((overall_index % 1024) != value_stored) { + printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + } + } } pos += batch_size; @@ -547,6 +561,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int value_count = s->input_value_count; static constexpr bool enable_print = false; + static constexpr bool enable_print_range_error = false; + static constexpr bool enable_print_large_list = true; int const printf_num_threads = 0; // how many rows we've processed in the page so far @@ -568,6 +584,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); +if constexpr (enable_print_large_list) { + auto first_ni_value_count = s->nesting_info[0].value_count; + 
if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ + printf("ALGO GARBAGE GET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); + } +} + while (value_count < target_value_count) { if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } @@ -576,24 +600,33 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int def_level = -1, start_depth = -1, end_depth = -1; + int rep_level = -1, def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - auto const rep_level = static_cast(rep[index]); + rep_level = static_cast(rep[index]); def_level = static_cast(def[index]); //computed by generate_depth_remappings() - if constexpr (enable_print) { + if constexpr (enable_print || enable_print_range_error) { if((rep_level < 0) || (rep_level > max_depth)) { printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); } + if((def_level < 0)/* || (def_level > (max_depth + 1)) */ ) { + printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); + } } + start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; - if constexpr (enable_print) { - if((def_level < 0) || (def_level > (max_depth + 1))) { - printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); + if constexpr (enable_print || enable_print_range_error) { + if((start_depth < 0) || (start_depth > (max_depth + 1))) { + printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); } + if((end_depth < 0) || (end_depth > (max_depth + 1))) { + printf("WHOA: end_depth %d out of bounds (max_depth %d) (index %d)!\n", end_depth, max_depth, index); + } + } + if constexpr (enable_print) { if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ t, def_level, rep_level, start_depth, end_depth); } } @@ -609,6 +642,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be reused +if constexpr (enable_print_large_list) { + if(bool(is_new_row) != (t % 4 == 0)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row); + } + if(num_prior_new_rows != ((t + 3) / 4)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", + blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); + } + if(total_num_new_rows != 32) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", + blockIdx.x, value_count, target_value_count, t, total_num_new_rows); + } +} + if constexpr (enable_print) { if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } } @@ -633,6 +681,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // queries is_valid 
from all threads, stores prior total and total total int thread_value_count = 0, block_value_count = 0; block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); + __syncthreads(); + +if constexpr (enable_print_large_list) { + if(in_nesting_bounds != (t % 4 == 0)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", + blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); + } + if(thread_value_count != ((t + 3) / 4)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, thread_value_count); + } + if(block_value_count != 32) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, block_value_count); + } +} if constexpr (enable_print) { if (t == 0) { printf("block_value_count %d\n", block_value_count); } @@ -670,13 +734,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( static_assert(decode_block_size <= 8*sizeof(__uint128_t), "This code relies on bits for block threads fitting within a uint128!"); - using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; - __shared__ typename block_reduce::TempStorage reduce_storage; auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ return lhs | rhs; }; + + using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; + __shared__ typename block_reduce::TempStorage reduce_storage; __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); + __syncthreads(); // TODO: WHY IS THIS NEEDED? 
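// A hedged worked example of what this reduction produces: each in-bounds thread
// contributes a single bit (is_valid) shifted to its thread_value_count position, so
// OR-reducing across the block yields one validity bit per value in this batch.
// If the first four values were valid, null, valid, valid, the low bits of
// block_valid_mask would be 0b1101; the thread holding value 3 would then compute
// thread_mask = (1 << 3) - 1 = 0b0111 and
// thread_valid_count = popcount(0b1101 & 0b0111) = 2, i.e. two valid values precede it.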
//Reduction result is only visible to thread zero, must share with other threads: __shared__ __uint128_t block_valid_mask_storage; @@ -689,6 +755,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); +int const block_valid_count = count_set_bits(block_valid_mask); + +if constexpr (enable_print_large_list) { + if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); + } + if (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); + } + if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); + } +} + + if constexpr (enable_print) { if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { @@ -715,6 +799,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); + __syncthreads(); // TODO: WHY IS THIS NEEDED? + + +if constexpr (enable_print_large_list) { + if(next_in_nesting_bounds != 1) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); + } + if(next_thread_value_count != t) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_thread_value_count); + } + if(next_block_value_count != 128) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_block_value_count); + } +} + if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } @@ -734,6 +836,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; +int overall_index = 4*(blockIdx.x * 20000 + idx); +if(overall_index != ofs) { + printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " + "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " + "target_value_count %d, block_value_count %d, next_block_value_count %d\n", + ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, + next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, + total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); +} + + if constexpr (enable_print || enable_print_range_error) { + if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } + if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } + } + if constexpr (enable_print) { if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", @@ -801,18 +921,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const src_pos = ni.valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); + if constexpr (enable_print || enable_print_range_error) { + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index STORE %d out of bounds!\n", output_index); + } + if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } + } + if constexpr (enable_print) { if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index out of bounds!\n"); - } - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} - printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); + if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } } //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) @@ -822,11 +945,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // update stuff if (t == 0) { - int const block_valid_count = count_set_bits(block_valid_mask); +// int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } + __syncthreads(); // handle modification of ni.value_count from below // propagate value counts for the next depth level block_value_count = next_block_value_count; @@ -853,6 +977,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // If we have lists # rows != # values s->input_row_count = 
input_row_count; +if constexpr (enable_print_large_list) { + auto first_ni_value_count = s->nesting_info[0].value_count; + if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ + printf("ALGO GARBAGE SET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); + } +} } __syncthreads(); @@ -927,6 +1058,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_state_s* const s = &state_g; auto* const sb = &state_buffers; int const page_idx = blockIdx.x; +/* page_idx = (page_idx == -1) ? blockIdx.x : page_idx + blockIdx.x; + if(page_idx >= num_pages) { + printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); + return; + }*/ int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; @@ -1008,7 +1144,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } - dict_stream.decode_next(t, s->page.skipped_leaf_values); + if constexpr (has_lists_t){ + int init_decode = 0; + while (init_decode < s->page.skipped_leaf_values) { + auto const to_skip = min(decode_block_size_t, s->page.skipped_leaf_values - init_decode); + dict_stream.decode_next(t, to_skip); + init_decode += to_skip; + __syncthreads(); + } + } } __syncthreads(); @@ -1044,14 +1188,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) print_nesting_level(s->nesting_info[0]); printf("POST %d NESTING 1: ", int(is_post)); print_nesting_level(s->nesting_info[1]); - printf("POST %d NESTING 2: ", int(is_post)); - print_nesting_level(s->nesting_info[2]); + //printf("POST %d NESTING 2: ", int(is_post)); + //print_nesting_level(s->nesting_info[2]); } } }; print_nestings(false); - + if constexpr (enable_print) { + if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} + } while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; @@ -1153,7 +1299,13 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + /* + auto num_pages = pages.size(); + auto grid_dim = num_pages; //2, 10, 40, 100 no problem; all = problem + dim3 dim_grid(grid_dim, 1); // 1 threadblock per page +for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { + */ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric pages, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); +// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric pages, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); +// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric Date: Sat, 7 Sep 2024 14:55:13 -0400 Subject: [PATCH 06/38] Tests working --- cpp/src/io/parquet/decode_fixed.cu | 147 +++++++++++++++++++---------- cpp/src/io/parquet/reader_impl.cpp | 4 + 2 files changed, 103 insertions(+), 48 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 33f11aef9b2..b1acb4d8a86 100644 --- 
a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -41,6 +41,8 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; + static constexpr bool enable_print_large_list = false; + static constexpr bool enable_print_loop_check = false; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -50,11 +52,20 @@ __device__ inline void gpuDecodeFixedWidthValues( } } +int loop_count = 0; + // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN gpuDecodeFixedWidthValues!\n"); + } + } + int const target_pos = pos + batch_size; int src_pos = pos + t; @@ -133,11 +144,13 @@ __device__ inline void gpuDecodeFixedWidthValues( gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } - if (dtype == INT32) { - int value_stored = *static_cast(dst); - int overall_index = blockIdx.x * 20000 * 4 + src_pos; - if((overall_index % 1024) != value_stored) { - printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + if constexpr (enable_print_large_list) { + if (dtype == INT32) { + int value_stored = *static_cast(dst); + int overall_index = blockIdx.x * 20000 * 4 + src_pos; + if((overall_index % 1024) != value_stored) { + printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + } } } } @@ -562,7 +575,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = true; + static constexpr bool enable_print_large_list = false; + static constexpr bool enable_print_loop_check = false; int const printf_num_threads = 0; // how many rows we've processed in the page so far @@ -592,7 +606,20 @@ if constexpr (enable_print_large_list) { } } + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + +int loop_count = 0; + while (value_count < target_value_count) { + + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN LISTS!\n"); + } + } + if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } @@ -627,20 +654,19 @@ if constexpr (enable_print_large_list) { } } if constexpr (enable_print) { - if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth); } + if (t == 0) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d, max_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth, max_depth); } } } //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. + //THIS IS THE UNDO POINT int const is_new_row = start_depth == 0 ? 
1 : 0; int num_prior_new_rows, total_num_new_rows; - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); - __syncthreads(); //Needed because scan_storage will be reused + __syncthreads(); if constexpr (enable_print_large_list) { if(bool(is_new_row) != (t % 4 == 0)) { @@ -704,9 +730,17 @@ if constexpr (enable_print_large_list) { t, thread_value_count, in_nesting_bounds); } } +int depth_loop_count = 0; // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + if constexpr (enable_print_loop_check) { + ++depth_loop_count; + if((depth_loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN LISTS DEPTH!\n"); + } + } + auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -742,7 +776,6 @@ if constexpr (enable_print_large_list) { using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; __shared__ typename block_reduce::TempStorage reduce_storage; __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); - __syncthreads(); // TODO: WHY IS THIS NEEDED? //Reduction result is only visible to thread zero, must share with other threads: __shared__ __uint128_t block_valid_mask_storage; @@ -755,7 +788,7 @@ if constexpr (enable_print_large_list) { }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); -int const block_valid_count = count_set_bits(block_valid_mask); +//int const block_valid_count = count_set_bits(block_valid_mask); if constexpr (enable_print_large_list) { if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { @@ -766,14 +799,12 @@ if constexpr (enable_print_large_list) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); } - if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { +/* if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - } + }*/ } - - if constexpr (enable_print) { if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " @@ -796,11 +827,8 @@ if constexpr (enable_print_large_list) { next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - __syncthreads(); // TODO: WHY IS THIS NEEDED? 
- + __syncthreads(); if constexpr (enable_print_large_list) { if(next_in_nesting_bounds != 1) { @@ -817,7 +845,6 @@ if constexpr (enable_print_large_list) { } } - if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", @@ -836,17 +863,19 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; -int overall_index = 4*(blockIdx.x * 20000 + idx); -if(overall_index != ofs) { - printf("WHOA BAD OFFSET: WROTE %d to %d! t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " - "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " - "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " - "target_value_count %d, block_value_count %d, next_block_value_count %d\n", - ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, - next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, - next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); +if constexpr (enable_print_large_list) { + int overall_index = 4*(blockIdx.x * 20000 + idx); + if(overall_index != ofs) { + printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " + "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " + "target_value_count %d, block_value_count %d, next_block_value_count %d\n", + ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, + next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, + total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); + } } if constexpr (enable_print || enable_print_range_error) { @@ -914,11 +943,14 @@ if(overall_index != ofs) { } // if this is valid and we're at the leaf, output dst_pos + // Read these before the sync, so that when thread 0 modifies them we've already read their values + int current_value_count = ni.value_count; + int current_valid_count = ni.valid_count; __syncthreads(); // handle modification of ni.valid_count from below if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across - int const dst_pos = ni.value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = current_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); if constexpr (enable_print || enable_print_range_error) { @@ -941,11 +973,11 @@ if(overall_index != ofs) { //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } - __syncthreads(); // handle modification of ni.value_count from below +// __syncthreads(); // handle modification of ni.value_count from below TODO: TRY REMOVE // update stuff if (t == 0) { -// int const block_valid_count = count_set_bits(block_valid_mask); + int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; @@ -1044,7 +1076,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) device_span chunks, size_t min_row, size_t num_rows, - kernel_error::pointer error_code) + kernel_error::pointer error_code /*, int page_idx = -1, int num_pages = -1*/) { constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); @@ -1059,7 +1091,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; /* page_idx = (page_idx == -1) ? blockIdx.x : page_idx + blockIdx.x; - if(page_idx >= num_pages) { + if((page_idx >= num_pages) && (num_pages != -1)) { printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); return; }*/ @@ -1091,16 +1123,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. 
all shared memory is suballocated out of here + static constexpr auto align_test = false; + static constexpr size_t buffer_alignment = align_test ? 128 : 16; constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), size_t{16}) : 0; + sizeof(rle_run), buffer_alignment) : 0; constexpr int shared_dict_size = has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment) : 0; constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment); constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; - __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; + __shared__ __align__(buffer_alignment) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; @@ -1133,11 +1167,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } static constexpr bool enable_print = false; + static constexpr bool enable_print_loop_check = false; rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { - //auto const skipped_leaf_values = s->page.skipped_leaf_values; - //int const dict_offset = skipped_leaf_values * sizeof(uint32_t); dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); if constexpr (enable_print) { @@ -1156,6 +1189,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } __syncthreads(); + if constexpr (enable_print) { + if((t == 0) && (page_idx == 0)){ + printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); + } + } + // We use two counters in the loop below: processed_count and valid_count. // - processed_count: number of values out of num_input_values that we have decoded so far. 
// the definition stream returns the number of total rows it has processed in each call @@ -1198,11 +1237,22 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (enable_print) { if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} } +int loop_count = 0; while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 10000) && (t == 0)) { + printf("INFINITE LOOP IN MAIN!\n"); + } + } + if constexpr (has_lists_t){ rep_decoder.decode_next(t); + if constexpr (!align_test) { + __syncthreads(); + } } // only need to process definition levels if this is a nullable column @@ -1299,13 +1349,13 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - /* +/* auto num_pages = pages.size(); - auto grid_dim = num_pages; //2, 10, 40, 100 no problem; all = problem + auto grid_dim = 1; //2, 10, 40, 100 no problem; all = problem dim3 dim_grid(grid_dim, 1); // 1 threadblock per page for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { - */ +*/ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric pages, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8f33f318f54..b305a7348e1 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -51,6 +51,8 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; +//printf("PREP LAUNCH: decode_page_data: mode %d, skip_rows %lu, num_rows %lu, #pages %lu\n", +// (int)mode, skip_rows, num_rows, subpass.pages.size()); auto& page_nesting = subpass.page_nesting_info; auto& page_nesting_decode = subpass.page_nesting_decode_info; @@ -418,9 +420,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); +//printf("SYNC ERROR CODE\n"); if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } +//printf("ERROR CODE SUNK\n"); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each From 24c9ab1012d69d0a5e98134ce8c3c88082f34c24 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 9 Sep 2024 17:27:16 -0400 Subject: [PATCH 07/38] compile fixes --- cpp/src/io/parquet/decode_fixed.cu | 60 +++++++----------------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index b1acb4d8a86..62bdf01c533 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -42,7 +42,6 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; static constexpr bool enable_print_large_list = false; - static constexpr bool enable_print_loop_check = false; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -52,20 +51,11 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -int loop_count = 0; - // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN gpuDecodeFixedWidthValues!\n"); - } - } - int const target_pos = pos + batch_size; int src_pos = pos + t; @@ -576,8 +566,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; static constexpr bool enable_print_large_list = false; - static constexpr bool enable_print_loop_check = false; - int const printf_num_threads = 0; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; @@ -609,17 +597,8 @@ if constexpr (enable_print_large_list) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; -int loop_count = 0; - while (value_count < target_value_count) { - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN LISTS!\n"); - } - } - if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } @@ -700,7 +679,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + if (t < 32) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } } @@ -726,21 +705,13 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + if (t < 32) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", t, thread_value_count, in_nesting_bounds); } } -int depth_loop_count = 0; // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - if constexpr (enable_print_loop_check) { - ++depth_loop_count; - if((depth_loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN LISTS DEPTH!\n"); - } - } - auto& ni = 
s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -754,7 +725,7 @@ int depth_loop_count = 0; if constexpr (enable_print) { if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", + if (t < 32) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } } @@ -813,7 +784,7 @@ if constexpr (enable_print_large_list) { row_index_lower_bound, last_row, input_row_count); } if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } - if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } } // compute warp and thread value counts for the -next- nesting level. we need to @@ -847,9 +818,9 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + if (t < 32) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + if (t < 32) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } } // if we're -not- at a leaf column and we're within nesting/row bounds @@ -885,7 +856,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } - if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + if (t < 32) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } } } @@ -962,7 +933,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} @@ -1167,7 +1138,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } static constexpr bool enable_print = false; - static constexpr bool enable_print_loop_check = false; rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { @@ -1193,6 +1163,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); } + if constexpr (has_lists_t){ + printf("Is fixed list page\n"); + } else { + printf("Is fixed non-list page\n"); + } } // We use two counters in the loop below: processed_count and 
valid_count. @@ -1237,17 +1212,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (enable_print) { if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} } -int loop_count = 0; + while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 10000) && (t == 0)) { - printf("INFINITE LOOP IN MAIN!\n"); - } - } - if constexpr (has_lists_t){ rep_decoder.decode_next(t); if constexpr (!align_test) { From 342c2f43633ca10c0a547d2f6ace65836fde6b94 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 10 Sep 2024 14:25:33 -0400 Subject: [PATCH 08/38] No need to decode def levels if not nullable --- cpp/src/io/parquet/decode_fixed.cu | 31 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 62bdf01c533..cd4345266d2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -610,20 +610,24 @@ if constexpr (enable_print_large_list) { if (within_batch) { int const index = rolling_index(value_count + t); rep_level = static_cast(rep[index]); - def_level = static_cast(def[index]); + if constexpr (nullable) { + def_level = static_cast(def[index]); + end_depth = s->nesting_info[def_level].end_depth; + } else { + end_depth = max_depth; + } //computed by generate_depth_remappings() if constexpr (enable_print || enable_print_range_error) { if((rep_level < 0) || (rep_level > max_depth)) { printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); } - if((def_level < 0)/* || (def_level > (max_depth + 1)) */ ) { + if(nullable && ((def_level < 0)/* || (def_level > (max_depth + 1)) */ )) { printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); } } start_depth = s->nesting_info[rep_level].start_depth; - end_depth = s->nesting_info[def_level].end_depth; if constexpr (enable_print || enable_print_range_error) { if((start_depth < 0) || (start_depth > (max_depth + 1))) { printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); @@ -736,6 +740,7 @@ if constexpr (enable_print_large_list) { // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct // position for thread t's bit is thread_value_count. + static_assert(decode_block_size <= 8*sizeof(__uint128_t), "This code relies on bits for block threads fitting within a uint128!"); @@ -944,7 +949,6 @@ if constexpr (enable_print_large_list) { //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } -// __syncthreads(); // handle modification of ni.value_count from below TODO: TRY REMOVE // update stuff if (t == 0) { @@ -1091,7 +1095,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); - bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. 
all shared memory is suballocated out of here static constexpr auto align_test = false; @@ -1119,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (should_process_def_levels) { + if (should_process_nulls) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -1187,7 +1190,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } } - auto print_nestings = [&](bool is_post){ if constexpr (enable_print) { auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { @@ -1216,19 +1218,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - if constexpr (has_lists_t){ - rep_decoder.decode_next(t); - if constexpr (!align_test) { - __syncthreads(); - } - } - // only need to process definition levels if this is a nullable column if (should_process_nulls) { processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_lists_t) { + rep_decoder.decode_next(t); + __syncthreads(); + int value_count = s->input_value_count; next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); @@ -1254,13 +1252,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. else { if constexpr (has_lists_t) { - // no nulls, but if we have a list we still need the definition levels - processed_count += def_decoder.decode_next(t); + processed_count += rep_decoder.decode_next(t); __syncthreads(); next_valid_count = gpuUpdateValidityAndRowIndicesLists( - processed_count, s, sb, def, rep, t); + processed_count, s, sb, nullptr, rep, t); } else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); From 50bbc94a571ea1163acebbb23453d01c0ba8793d Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 10 Sep 2024 16:03:25 -0400 Subject: [PATCH 09/38] Manual block scan --- cpp/src/io/parquet/decode_fixed.cu | 177 ++++++++++++++++++----------- 1 file changed, 110 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index cd4345266d2..b47b96b91a2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -551,6 +551,48 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +struct scan_results +{ + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & thread_mask); + + __shared__ uint32_t warp_counts[num_warps]; + if(warp_lane == 0) { + warp_counts[warp_index] = results.warp_count; + } + __syncthreads(); + + results.block_count = 0; + results.thread_count_within_block = 
results.thread_count_within_warp; + for(int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if(warp_idx < warp_index) { + results.thread_count_within_block += warp_counts[warp_idx]; + } + } +} + +template +static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +{ + uint32_t warp_bits = ballot(thread_bit); + scan_block(warp_bits, warp_lane, warp_index, thread_mask, results); +} + template static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, @@ -597,6 +639,11 @@ if constexpr (enable_print_large_list) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = warp_lane == 0; + int const warp_index = t / cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + while (value_count < target_value_count) { if constexpr (enable_print) { @@ -688,9 +735,15 @@ if constexpr (enable_print_large_list) { } // queries is_valid from all threads, stores prior total and total total - int thread_value_count = 0, block_value_count = 0; - block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); - __syncthreads(); + + //WARP VALUE COUNT: + scan_results value_count_scan_results; + scan_block(in_nesting_bounds, warp_lane, warp_index, lane_mask, value_count_scan_results); + + int thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + int warp_value_count = value_count_scan_results.warp_count; + int thread_value_count = value_count_scan_results.thread_count_within_block; + int block_value_count = value_count_scan_results.block_count; if constexpr (enable_print_large_list) { if(in_nesting_bounds != (t % 4 == 0)) { @@ -741,30 +794,22 @@ if constexpr (enable_print_large_list) { // the validity bit for thread t might actually represent output value t-6. the correct // position for thread t's bit is thread_value_count. - static_assert(decode_block_size <= 8*sizeof(__uint128_t), - "This code relies on bits for block threads fitting within a uint128!"); - - auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; - auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ - return lhs | rhs; - }; - using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; - __shared__ typename block_reduce::TempStorage reduce_storage; - __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); +//WARP VALID COUNT: + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is thread_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. 
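The warp reduce mentioned above can be pictured with a small self-contained sketch (warp_or_reduce_sketch and warp_valid_scan_sketch are illustrative names, not helpers from this patch): each thread shifts its single validity bit up to its value position within the warp, the lanes are OR-reduced into one 32-bit mask, and a prefix popcount of that mask gives every thread its exclusive count of preceding valid values.

__device__ inline uint32_t warp_or_reduce_sketch(uint32_t bits)
{
  // butterfly OR-reduction across the 32 lanes of a warp
  for (int offset = 16; offset > 0; offset >>= 1) {
    bits |= __shfl_xor_sync(0xffffffffu, bits, offset);
  }
  return bits;
}

__device__ inline void warp_valid_scan_sketch(int is_valid,
                                              int bit_position_within_warp,
                                              uint32_t& warp_valid_mask,
                                              int& thread_valid_count_within_warp)
{
  // place this thread's validity bit at its output position, then combine all lanes
  warp_valid_mask =
    warp_or_reduce_sketch(static_cast<uint32_t>(is_valid) << bit_position_within_warp);
  // exclusive prefix: count the set bits strictly below this thread's position
  uint32_t const below_mask = (uint32_t{1} << bit_position_within_warp) - 1;
  thread_valid_count_within_warp = __popc(warp_valid_mask & below_mask);
}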
+ uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - //Reduction result is only visible to thread zero, must share with other threads: - __shared__ __uint128_t block_valid_mask_storage; - if(t == 0) { block_valid_mask_storage = block_valid_mask; } - __syncthreads(); - block_valid_mask = block_valid_mask_storage; + scan_results valid_count_scan_results; + scan_block(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - auto count_set_bits = [](__uint128_t bits){ - return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); - }; - auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; - int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); -//int const block_valid_count = count_set_bits(block_valid_mask); + int warp_valid_count = valid_count_scan_results.warp_count; + int thread_valid_count = valid_count_scan_results.thread_count_within_block; + int block_valid_count = valid_count_scan_results.block_count; if constexpr (enable_print_large_list) { if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { @@ -775,20 +820,20 @@ if constexpr (enable_print_large_list) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); } -/* if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - }*/ + } } if constexpr (enable_print) { - if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { + if((block_valid_count == 0) && (t == 0) && (d_idx == max_depth)) { printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, row_index_lower_bound, last_row, input_row_count); } - if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } + if (t == 0) { printf("block_valid_count %u\n", int(block_valid_count)); } if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } } @@ -796,15 +841,23 @@ if constexpr (enable_print_large_list) { // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level - int32_t next_thread_value_count = 0, next_block_value_count = 0; + int next_thread_value_count_within_warp = 0, next_warp_value_count = 0; + int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { //mask is different between depths next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 
1 : 0; - block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - __syncthreads(); +//NEXT WARP VALUE COUNT: + scan_results next_value_count_scan_results; + scan_block(next_in_nesting_bounds, warp_lane, warp_index, lane_mask, next_value_count_scan_results); + + next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count = next_value_count_scan_results.thread_count_within_block; + next_block_value_count = next_value_count_scan_results.block_count; + if constexpr (enable_print_large_list) { if(next_in_nesting_bounds != 1) { @@ -873,49 +926,38 @@ if constexpr (enable_print_large_list) { // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; if constexpr (nullable) { - if (ni.valid_map != nullptr) { //TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count //so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... - uint32_t const warp_count_mask = ballot(in_nesting_bounds); - if ((t % cudf::detail::warp_size) == 0) { - // last bit in the warp to store //in old is warp_valid_mask_bit_count + + int warp_null_count = 0; + if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + // last bit in the warp to store //in old is warp_valid_mask_bit_count //so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level - int const bit_count = __popc(warp_count_mask); - if(bit_count > 0) { - - // absolute bit offset into the output validity map - //is cumulative sum of bit_count at the given nesting depth - // DON'T subtract by first_row: since it's lists it's not 1-row-per-value - int const bit_offset = ni.valid_map_offset + thread_value_count; - auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); - auto const bit_range_mask = (1 << bit_count) - 1; //mainly needed for warp_null_count - auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); - warp_null_count = bit_count - __popc(warp_validity_mask); - - if constexpr (enable_print) { - printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", - t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, bit_count, warp_validity_mask); - printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); - } + + // absolute bit offset into the output validity map + //is cumulative sum of warp_value_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + int const bit_offset = ni.valid_map_offset + thread_value_count; + + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); + warp_null_count = warp_value_count - warp_valid_count; + + if constexpr (enable_print) { + printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", + t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); + printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", 
t, d_idx, warp_null_count); } - } } - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if constexpr (enable_print) { - if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", - d_idx, ni.null_count, block_null_count); } + if (t == 0) { + size_type const block_null_count = block_value_count - block_valid_count; + if constexpr (enable_print) { + if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", + d_idx, ni.null_count, block_null_count); } + } + ni.null_count += block_null_count; } - if (t == 0) { ni.null_count += block_null_count; } } // if this is valid and we're at the leaf, output dst_pos @@ -952,7 +994,6 @@ if constexpr (enable_print_large_list) { // update stuff if (t == 0) { - int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; @@ -963,6 +1004,8 @@ if constexpr (enable_print_large_list) { block_value_count = next_block_value_count; thread_value_count = next_thread_value_count; in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; + thread_value_count_within_warp = next_thread_value_count_within_warp; } //END OF DEPTH LOOP if constexpr (enable_print) { From 539066114f909f13b329468f570bdaad1bf3470e Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 11:47:43 -0400 Subject: [PATCH 10/38] Optimize parquet reader block scans, simplify and consolidate non-nullable column code --- cpp/src/io/parquet/decode_fixed.cu | 369 +++++++++++++++++------------ 1 file changed, 212 insertions(+), 157 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..73eb9e87c61 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,39 @@ namespace cudf::io::parquet::detail { namespace { +struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int t, int thread_bit, block_scan_results& results) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + results.warp_bits = ballot(thread_bit); + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int 
end, int t) @@ -194,7 +227,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -212,28 +245,27 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; - __syncthreads(); + auto& max_depth_ni = s->nesting_info[max_depth]; + int valid_count = max_depth_ni.valid_count; while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } + // definition level + int d; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); + } else { + d = 1; } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +274,74 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. 
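A compact sketch of that per-warp write-window arithmetic, with a worked example in the comments (the function name is illustrative; the real code below keeps store_validity and the absolute offsets):

__device__ inline int warp_null_count_sketch(uint32_t in_write_row_bounds_mask,
                                             uint32_t warp_validity_mask)
{
  // first bit to store, or -1 if no row of this warp is inside [first_row, last_row)
  int const write_start = __ffs(in_write_row_bounds_mask) - 1;
  if (write_start < 0) { return 0; }
  // one past the last bit to store; out-of-bounds rows are never valid, so no higher bits are set
  int const write_end = 32 - __clz(in_write_row_bounds_mask);
  int const bit_count = write_end - write_start;
  // e.g. bounds mask 0xFFFFFFF0 -> write_start 4, write_end 32, bit_count 28;
  // validity mask 0x0F0FFFF0 >> 4 = 0x00F0FFFF has 20 set bits, so 8 of the 28 stored rows are null
  return bit_count - __popc(warp_validity_mask >> write_start);
}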
- int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan_results valid_count_results; + scan_block_exclusive_sum(t, is_valid, valid_count_results); + uint32_t const warp_validity_mask = valid_count_results.warp_bits; + int thread_valid_count = valid_count_results.thread_count_within_block; + int block_valid_count = valid_count_results.block_count; - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && (ni.valid_map != nullptr) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
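The block-wide sum used next needs just one meaningful value per warp (lane 0 carries warp_null_count). A hedged sketch of that shape, written from the assumed semantics of cudf::detail::single_lane_block_sum_reduce rather than its actual implementation:

template <int block_size>
__device__ int single_lane_block_sum_sketch(int lane0_value)
{
  constexpr int num_warps = block_size / 32;
  __shared__ int warp_sums[num_warps];

  int const t = threadIdx.x;
  if ((t % 32) == 0) { warp_sums[t / 32] = lane0_value; }  // only lane 0 of each warp contributes
  __syncthreads();

  int sum = 0;
  for (int i = 0; i < num_warps; ++i) { sum += warp_sums[i]; }
  __syncthreads();  // let warp_sums be reused safely by a later call
  return sum;       // every thread ends up with the block total
}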
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = valid_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -346,88 +362,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; - __syncthreads(); - while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan_results valid_count_results; + scan_block_exclusive_sum(t, is_valid, valid_count_results); + uint32_t const warp_validity_mask = valid_count_results.warp_bits; + int thread_valid_count = valid_count_results.thread_count_within_block; + int block_valid_count = valid_count_results.block_count; + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
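As a worked example with hypothetical numbers: take first_row = 100, valid_map_offset = 0, and a warp whose lane 0 sits at absolute value index vindex = 96, so the warp covers rows 96..127. The ballot below then has bits 4..31 set, giving write_start = 4, write_end = 32 and bit_count = 28, and the first stored bit lands at bit_offset = (0 + 96 + 4) - 100 = 0, the very start of the output validity map even though decoding began 100 values before the first requested row.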
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +446,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
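For instance, with hypothetical numbers first_row = 10 and s->num_rows = 20, last_row is 30: a call asking for target_value_count = 128 is capped to 30, so the loop below walks only the first 30 input values of the page, and of those only the rows whose index falls in [row_index_lower_bound, 30) produce output.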
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int is_valid = in_row_bounds; + + int thread_valid_count = thread_value_count; + int block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -614,10 +676,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +688,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
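For readers tracking the src_pos/dst_pos bookkeeping, nz_idx is a circular map from the ordinal of each valid (to-be-decoded) value to the row slot it must land in; the validity pass writes it and gpuDecodeFixedWidthValues reads it. A minimal model, assuming a power-of-two buffer size as in the kernels here (the names are illustrative):

constexpr int rolling_buf_size_sketch = 256;  // assumed: decode_block_size * 2, a power of two

__device__ inline int rolling_index_sketch(int pos) { return pos & (rolling_buf_size_sketch - 1); }

// validity pass: the src_pos-th valid value must be written to output row dst_pos
__device__ void record_output_position(int* nz_idx, int src_pos, int dst_pos)
{
  nz_idx[rolling_index_sketch(src_pos)] = dst_pos;
}

// value-decoding pass: look up where the src_pos-th decoded value goes
__device__ int output_row_for(int const* nz_idx, int src_pos)
{
  return nz_idx[rolling_index_sketch(src_pos)];
}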
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - - if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } __syncthreads(); From 3ef7b0d8c6109d618c2f114d1970342a0442d9a9 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 12:07:40 -0400 Subject: [PATCH 11/38] tweak syncing --- cpp/src/io/parquet/decode_fixed.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 73eb9e87c61..0638b3e5d5a 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -248,6 +248,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( auto& max_depth_ni = s->nesting_info[max_depth]; int valid_count = max_depth_ni.valid_count; + __syncthreads(); + while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); @@ -362,6 +364,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; + __syncthreads(); + while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); @@ -480,16 +484,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid = in_row_bounds; - + int is_valid = in_row_bounds; int thread_valid_count = thread_value_count; int block_valid_count = block_value_count; // if this is valid and we're at the leaf, output dst_pos if (is_valid) { // for non-list types, the value count is always the same across - int const dst_pos = value_count + thread_value_count; - int const src_pos = valid_count + thread_valid_count; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } From 788287936f4f21abc34a6bf9fc04f310e6b2824c Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 18:06:33 -0400 Subject: [PATCH 12/38] small tweaks --- cpp/src/io/parquet/decode_fixed.cu | 77 +++++++++++++----------------- 1 file changed, 32 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index b47b96b91a2..ac7a628bc19 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -562,13 +562,13 @@ struct scan_results }; template -static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); - results.thread_count_within_warp = __popc(results.warp_bits & thread_mask); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); __shared__ uint32_t 
warp_counts[num_warps]; if(warp_lane == 0) { @@ -587,10 +587,10 @@ static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_in } template -static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) { uint32_t warp_bits = ballot(thread_bit); - scan_block(warp_bits, warp_lane, warp_index, thread_mask, results); + scan_block(warp_bits, warp_lane, warp_index, lane_mask, results); } template @@ -625,6 +625,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; __syncthreads(); @@ -963,38 +964,39 @@ if constexpr (enable_print_large_list) { // if this is valid and we're at the leaf, output dst_pos // Read these before the sync, so that when thread 0 modifies them we've already read their values int current_value_count = ni.value_count; - int current_valid_count = ni.valid_count; - __syncthreads(); // handle modification of ni.valid_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = current_value_count + thread_value_count; - int const src_pos = current_valid_count + thread_valid_count; - int const output_index = rolling_index(src_pos); + __syncthreads(); // handle modification of ni.value_count from below + if (d_idx == max_depth) { + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); - if constexpr (enable_print || enable_print_range_error) { - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index STORE %d out of bounds!\n", output_index); + if constexpr (enable_print || enable_print_range_error) { + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index STORE %d out of bounds!\n", output_index); + } + if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } } - if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } - } - if constexpr (enable_print) { - if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if constexpr (enable_print) { + if (t == 0) { printf("ni.value_count %d, max_depth_valid_count %d\n", int(ni.value_count), max_depth_valid_count); } + if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", - output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} + if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, max_depth_valid_count %d, thread_value_count %d, thread_valid_count %d\n", + output_index, dst_pos, ni.value_count, max_depth_valid_count, thread_value_count, thread_valid_count);} - if (t == 0) { printf("OUTPUT_INDICES: 
output_index %d, dst_pos %d\n", output_index, dst_pos); } - } + if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } + } - //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) - sb->nz_idx[output_index] = dst_pos; + //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + sb->nz_idx[output_index] = dst_pos; + } + max_depth_valid_count += block_valid_count; } // update stuff if (t == 0) { - ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } @@ -1022,7 +1024,8 @@ if constexpr (enable_print_large_list) { if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; + s->nesting_info[max_depth].valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; s->input_value_count = value_count; // If we have lists # rows != # values @@ -1036,8 +1039,7 @@ if constexpr (enable_print_large_list) { } } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } // is the page marked nullable or not @@ -1094,7 +1096,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) device_span chunks, size_t min_row, size_t num_rows, - kernel_error::pointer error_code /*, int page_idx = -1, int num_pages = -1*/) + kernel_error::pointer error_code) { constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); @@ -1108,11 +1110,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_state_s* const s = &state_g; auto* const sb = &state_buffers; int const page_idx = blockIdx.x; -/* page_idx = (page_idx == -1) ? 
blockIdx.x : page_idx + blockIdx.x; - if((page_idx >= num_pages) && (num_pages != -1)) { - printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); - return; - }*/ int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; @@ -1357,13 +1354,6 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page -/* - auto num_pages = pages.size(); - auto grid_dim = 1; //2, 10, 40, 100 no problem; all = problem - dim3 dim_grid(grid_dim, 1); // 1 threadblock per page - -for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { -*/ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); -// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); -// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric pages, From e285fbfd44bb3f8ef14e81a35fdc0301bd2a0a1a Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 23 Sep 2024 16:42:05 -0400 Subject: [PATCH 13/38] Add skipping to rle_stream, use for lists (chunked reads) --- cpp/src/io/parquet/decode_fixed.cu | 112 +++++++++++++++++------------ cpp/src/io/parquet/rle_stream.cuh | 69 ++++++++++++++++++ 2 files changed, 134 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ac7a628bc19..fce8f53700d 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -41,7 +41,7 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = false; +// static constexpr bool enable_print_large_list = true; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -133,7 +133,7 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } - +/* if constexpr (enable_print_large_list) { if (dtype == INT32) { int value_stored = *static_cast(dst); @@ -143,6 +143,7 @@ __device__ inline void gpuDecodeFixedWidthValues( } } } + */ } pos += batch_size; @@ -628,15 +629,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int max_depth_valid_count = s->nesting_info[max_depth].valid_count; __syncthreads(); - -if constexpr (enable_print_large_list) { - auto first_ni_value_count = s->nesting_info[0].value_count; - if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ - printf("ALGO GARBAGE GET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); - } -} - + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; @@ -700,15 +693,15 @@ if constexpr (enable_print_large_list) { __syncthreads(); if constexpr (enable_print_large_list) { - if(bool(is_new_row) != (t % 4 == 0)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row); + if(within_batch && 
(bool(is_new_row) != (t % 4 == 0))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d, rep_level %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth, rep_level); } - if(num_prior_new_rows != ((t + 3) / 4)) { + if(within_batch && (num_prior_new_rows != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); } - if(total_num_new_rows != 32) { + if((value_count + 128 <= target_value_count) && (total_num_new_rows != 32)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", blockIdx.x, value_count, target_value_count, t, total_num_new_rows); } @@ -747,15 +740,17 @@ if constexpr (enable_print_large_list) { int block_value_count = value_count_scan_results.block_count; if constexpr (enable_print_large_list) { - if(in_nesting_bounds != (t % 4 == 0)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", - blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); + if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, " + "in_row_bounds %d, row_index %d, input_row_count %d, row_index_lower_bound %d, last_row %d, first_row %d, s->num_rows %d\n", + blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count, + row_index_lower_bound, last_row, first_row, s->num_rows); } - if(thread_value_count != ((t + 3) / 4)) { + if(within_batch && in_row_bounds && (thread_value_count != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n", blockIdx.x, value_count, target_value_count, t, thread_value_count); } - if(block_value_count != 32) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (block_value_count != 32)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n", blockIdx.x, value_count, target_value_count, t, block_value_count); } @@ -813,15 +808,15 @@ if constexpr (enable_print_large_list) { int block_valid_count = valid_count_scan_results.block_count; if constexpr (enable_print_large_list) { - if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { + if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); } - if (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t))) { + if (within_batch && in_row_bounds && (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t)))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, 
t, d_idx, thread_valid_count); } - if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128)))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); } @@ -859,17 +854,16 @@ if constexpr (enable_print_large_list) { next_thread_value_count = next_value_count_scan_results.thread_count_within_block; next_block_value_count = next_value_count_scan_results.block_count; - if constexpr (enable_print_large_list) { - if(next_in_nesting_bounds != 1) { + if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); } - if(next_thread_value_count != t) { + if(within_batch && in_row_bounds && (next_thread_value_count != t)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", blockIdx.x, value_count, target_value_count, t, next_thread_value_count); } - if(next_block_value_count != 128) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (next_block_value_count != 128)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", blockIdx.x, value_count, target_value_count, t, next_block_value_count); } @@ -893,9 +887,11 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; +/* if constexpr (enable_print_large_list) { int overall_index = 4*(blockIdx.x * 20000 + idx); if(overall_index != ofs) { + printf("WHOA BAD OFFSET\n"); printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " @@ -907,7 +903,7 @@ if constexpr (enable_print_large_list) { next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); } } - +*/ if constexpr (enable_print || enable_print_range_error) { if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } @@ -1030,13 +1026,6 @@ if constexpr (enable_print_large_list) { // If we have lists # rows != # values s->input_row_count = input_row_count; -if constexpr (enable_print_large_list) { - auto first_ni_value_count = s->nesting_info[0].value_count; - if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ - printf("ALGO GARBAGE SET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); - } -} } return max_depth_valid_count; @@ -1069,6 +1058,32 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) +{ + static constexpr bool enable_print = false; + + //Dictionary + int num_skipped = parquet_stream.skip_decode(t, num_to_skip); + if constexpr (enable_print) { + if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } + } + //it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: + //in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + //modulo 2 * block_size of course, since that's as many as we process at once + while (num_skipped < num_to_skip) { + auto const to_skip = min(2*decode_block_size_t, num_to_skip - num_skipped); + parquet_stream.decode_next(t, to_skip); + num_skipped += to_skip; + if constexpr (enable_print) { + if (t == 0) { printf("EXTRA SKIPPED: to_skip %d, at %d, for %d\n", to_skip, num_skipped, num_to_skip); } + } + __syncthreads(); + } + + return num_skipped; +} + /** * @brief Kernel for computing fixed width non dictionary column data stored in the pages * @@ -1190,18 +1205,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } - if constexpr (has_lists_t){ - int init_decode = 0; - while (init_decode < s->page.skipped_leaf_values) { - auto const to_skip = min(decode_block_size_t, s->page.skipped_leaf_values - init_decode); - dict_stream.decode_next(t, to_skip); - init_decode += to_skip; - __syncthreads(); - } - } } - __syncthreads(); - + if constexpr (enable_print) { if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); @@ -1225,6 +1230,19 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. 
decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues + //For lists (which can have skipped values, skip ahead in the decoding so that we don't repeat work + if constexpr (has_lists_t){ + if(s->page.skipped_leaf_values > 0) { + if (should_process_nulls) { + skip_decode(def_decoder, s->page.skipped_leaf_values, t); + } + processed_count = skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + if constexpr (has_dict_t) { + skip_decode(dict_stream, s->page.skipped_leaf_values, t); + } + } + } + if constexpr (enable_print) { if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 4a0791d5c54..490cf1d43c3 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -252,6 +252,8 @@ struct rle_stream { run.level_run = level_run; run.remaining = run.size; cur += run_bytes; +//printf("STORE RUN: decode_index %d, fill_index %d, output_pos %d, run.size %d\n", + //decode_index, fill_index, output_pos, run.size); output_pos += run.size; fill_index++; } @@ -353,6 +355,8 @@ struct rle_stream { // this is the last batch we will process this iteration if: // - either this run still has remaining values // - or it is consumed fully and its last index corresponds to output_count +//printf("STATUS: run_index %d, batch_len %d, remaining %d, at_end %d, last_run_pos %d, cur_values %d\n", + //run_index, batch_len, remaining, at_end, last_run_pos, cur_values); if (remaining > 0 || at_end) { values_processed_shared = output_count; } if (remaining == 0 && (at_end || is_last_decode_warp(warp_id))) { decode_index_shared = run_index + 1; @@ -372,6 +376,71 @@ struct rle_stream { return values_processed_shared; } + __device__ inline int skip_runs(int target_count) + { + //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount + //so thread 0 spins like crazy on fill_run_batch(), skipping writing unnecessary run info + //then when it hits the one that matters, we don't process it at all and bail as if we never started + //basically we're setting up the global vars necessary to start fill_run_batch for the first time + while (cur < end) { + // bytes for the varint header + uint8_t const* _cur = cur; + int const level_run = get_vlq32(_cur, end); + + // run_bytes includes the header size + int run_bytes = _cur - cur; + int run_size; + if (is_literal_run(level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run_size = (level_run >> 1) * 8; + run_bytes += ((run_size * level_bits) + 7) >> 3; + } else { + // repeated value run + run_size = (level_run >> 1); + run_bytes += ((level_bits) + 7) >> 3; + } + + if((output_pos + run_size) > target_count) { +//printf("SKIPPING: target_count %d, run_size %d, output_pos %d\n", target_count, run_size, output_pos); + return output_pos; //bail! we've reached the starting one + } + + output_pos += run_size; + cur += run_bytes; + } + +//printf("SKIPPING: target_count %d, output_pos %d\n", target_count, output_pos); + return output_pos; //we skipped everything + } + + + __device__ inline int skip_decode(int t, int count) + { + int const output_count = min(count, total_values - cur_values); + + // special case. 
if level_bits == 0, just return all zeros. this should tremendously speed up + // a very common case: columns with no nulls, especially if they are non-nested + if (level_bits == 0) { + cur_values = output_count; + return output_count; + } + + __shared__ int values_processed_shared; + + __syncthreads(); + + // warp 0 reads ahead and fills `runs` array to be decoded by remaining warps. + if (t == 0) { + values_processed_shared = skip_runs(output_count); + } + __syncthreads(); + + cur_values = values_processed_shared; + + // valid for every thread + return values_processed_shared; + } + __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } }; From 254f3e9ea9ea1934b82f7996475362b2aa3f8e4c Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 24 Sep 2024 09:40:29 -0400 Subject: [PATCH 14/38] tweak scan interface for linked lists --- cpp/src/io/parquet/decode_fixed.cu | 46 +++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 0638b3e5d5a..5010e116aa6 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,8 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. struct block_scan_results { uint32_t warp_bits; int thread_count_within_warp; @@ -34,21 +36,34 @@ struct block_scan_results { }; template -static __device__ void scan_block_exclusive_sum(int t, int thread_bit, block_scan_results& results) +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; - results.warp_bits = ballot(thread_bit); + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +{ + //Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + //Compute the warp-wide results + results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + //Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); + //Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { @@ -244,9 +259,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; - auto& max_depth_ni = s->nesting_info[max_depth]; - int valid_count = max_depth_ni.valid_count; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; __syncthreads(); @@ -280,7 +295,7 @@ static __device__ 
int gpuUpdateValidityAndRowIndicesNested( // thread and block validity count block_scan_results valid_count_results; - scan_block_exclusive_sum(t, is_valid, valid_count_results); + scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; int thread_valid_count = valid_count_results.thread_count_within_block; int block_valid_count = valid_count_results.block_count; @@ -320,11 +335,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (is_valid) { // for non-list types, the value count is always the same across int const dst_pos = value_count + thread_value_count; - int const src_pos = valid_count + thread_valid_count; + int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff - valid_count += block_valid_count; + max_depth_valid_count += block_valid_count; } } // end depth loop @@ -334,13 +349,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (t == 0) { // update valid value count for decoding and total # of values we've processed - max_depth_ni.valid_count = valid_count; - s->nz_count = valid_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; s->input_value_count = value_count; s->input_row_count = value_count; } - return valid_count; + return max_depth_valid_count; } template @@ -390,7 +405,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // thread and block validity count block_scan_results valid_count_results; - scan_block_exclusive_sum(t, is_valid, valid_count_results); + scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; int thread_valid_count = valid_count_results.thread_count_within_block; int block_valid_count = valid_count_results.block_count; @@ -671,7 +686,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column From 8ea1e0e723a9558ff462143e46d9feaabe974f2e Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 25 Sep 2024 13:31:04 -0400 Subject: [PATCH 15/38] style fixes --- cpp/src/io/parquet/decode_fixed.cu | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 5010e116aa6..9214af3e9e4 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,8 +24,9 @@ namespace cudf::io::parquet::detail { namespace { -// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. -// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. 
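// A rough sketch of the idea (illustrative only: `pred` and `lane_id` are placeholder names,
// and the real helpers below go through this file's ballot() wrapper and __popc()):
//
//   uint32_t warp_bits   = __ballot_sync(0xffffffffu, pred);  // one bit per lane with pred set
//   int      warp_count  = __popc(warp_bits);                 // warp-wide total
//   uint32_t lane_mask   = (1u << lane_id) - 1u;              // lanes strictly below this one
//   int      excl_prefix = __popc(warp_bits & lane_mask);     // this thread's exclusive prefix
//
// Each warp's count is then staged in shared memory and summed, so every thread also gets the
// block-wide total and its block-wide exclusive prefix; that is what block_scan_results and
// the scan_block_exclusive_sum() helpers below return in a single pass.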
struct block_scan_results { uint32_t warp_bits; int thread_count_within_warp; @@ -38,7 +39,7 @@ struct block_scan_results { template static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - int const t = threadIdx.x; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; @@ -48,22 +49,26 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul } template -static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) { - //Compute # warps + // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - //Compute the warp-wide results + + // Compute the warp-wide results results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - //Share the warp counts amongst the block threads + // Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); - //Compute block-wide results + // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { From 41cb98206640c57293d7ea325a6df7d85d08a56b Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:16:44 -0400 Subject: [PATCH 16/38] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9214af3e9e4..6b8559d400f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -296,7 +296,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; // thread and block validity count block_scan_results valid_count_results; From 6e705549e708c02795cfd3da52ffd3fa9cdfd4d7 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:17:05 -0400 Subject: [PATCH 17/38] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 6b8559d400f..f84cd7e4944 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -302,8 +302,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( block_scan_results valid_count_results; scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int thread_valid_count = valid_count_results.thread_count_within_block; - int block_valid_count = valid_count_results.block_count; + int const thread_valid_count = valid_count_results.thread_count_within_block; + int const block_valid_count = valid_count_results.block_count; // validity is processed per-warp // From 9ad44155988e2702e4b4526c5b60d9532cc59cd7 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:18:00 -0400 Subject: [PATCH 18/38] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f84cd7e4944..b18813551d9 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -504,9 +504,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid = in_row_bounds; - int thread_valid_count = thread_value_count; - int block_valid_count = block_value_count; + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; // if this is valid and we're at the leaf, output dst_pos if (is_valid) { From 3a1fc951fb04bc844c3cea8d327c688d3b49487d Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 26 Sep 2024 11:01:27 -0400 Subject: [PATCH 19/38] Unroll block-count loop --- cpp/src/io/parquet/decode_fixed.cu | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9214af3e9e4..98e64bf3475 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -71,9 +71,19 @@ static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; - for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + if constexpr ((num_warps == 4) || (num_warps == 8)) { + results.block_count = warp_counts[0] + warp_counts[1] + warp_counts[2] + warp_counts[3]; + if constexpr (num_warps 
== 8) { + results.block_count += warp_counts[4] + warp_counts[5] + warp_counts[6] + warp_counts[7]; + } + for (int warp_idx = 0; warp_idx < warp_index; ++warp_idx) { + results.thread_count_within_block += warp_counts[warp_idx]; + } + } else { + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } } } @@ -338,7 +348,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( // if this is valid and we're at the leaf, output dst_pos if (d_idx == max_depth) { if (is_valid) { - // for non-list types, the value count is always the same across int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; From 5ab9829c59d63ff112680ec088054696b18e6069 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 26 Sep 2024 13:10:53 -0400 Subject: [PATCH 20/38] more style fixes --- cpp/src/io/parquet/decode_fixed.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e6ea4dbbebe..993021fa5ef 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -312,8 +312,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( block_scan_results valid_count_results; scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int const thread_valid_count = valid_count_results.thread_count_within_block; - int const block_valid_count = valid_count_results.block_count; + int const thread_valid_count = valid_count_results.thread_count_within_block; + int const block_valid_count = valid_count_results.block_count; // validity is processed per-warp // From 447102230c3355b3a1cf61642e8f4b196fa1afb4 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 2 Oct 2024 15:43:47 -0400 Subject: [PATCH 21/38] Disable manual block scan for non-lists --- cpp/src/io/parquet/decode_fixed.cu | 48 ++++++++++++++++-------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 993021fa5ef..552cadcc509 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -49,7 +49,7 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul } template -static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, @@ -309,11 +309,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; // thread and block validity count - block_scan_results valid_count_results; - scan_block_exclusive_sum(is_valid, valid_count_results); - uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int const thread_valid_count = valid_count_results.thread_count_within_block; - int const block_valid_count = valid_count_results.block_count; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); // validity is processed per-warp // @@ -323,18 +322,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. int warp_null_count = 0; - // lane 0 from each warp writes out validity - if ((write_start >= 0) && (ni.valid_map != nullptr) && ((t % cudf::detail::warp_size) == 0)) { - int const valid_map_offset = ni.valid_map_offset; - int const vindex = value_count + thread_value_count; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } // sum null counts. 
we have to do it this way instead of just incrementing by (value_count - @@ -418,11 +420,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( } // thread and block validity count - block_scan_results valid_count_results; - scan_block_exclusive_sum(is_valid, valid_count_results); - uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int thread_valid_count = valid_count_results.thread_count_within_block; - int block_valid_count = valid_count_results.block_count; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); // validity is processed per-warp // From c0ed2cb3175183d85579c2197ac5f80bdc4e0a17 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:29:17 -0400 Subject: [PATCH 22/38] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: Vukasin Milovanovic --- cpp/src/io/parquet/decode_fixed.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 993021fa5ef..c2548fcd42a 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -284,13 +284,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level - int d; + int d = 1; if (t >= batch_size) { d = -1; } else if (def) { d = static_cast(def[rolling_index(value_count + t)]); - } else { - d = 1; } int const thread_value_count = t; From b898cbabbf2821da8dcaba92e6c724a24069c8bc Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 4 Oct 2024 12:32:53 -0400 Subject: [PATCH 23/38] Style fixes --- cpp/src/io/parquet/decode_fixed.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 62709e0b27f..42f90880fe9 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -327,7 +327,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const valid_map_offset = ni.valid_map_offset; int const vindex = value_count + thread_value_count; // absolute input value index int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map + first_row; // absolute bit offset into the output validity map int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store int const bit_count = write_end - write_start; From b0ee9fc97873a36da2f0dd0c23fc9fcd787b9905 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 10:34:58 -0400 Subject: [PATCH 24/38] renaming --- cpp/src/io/parquet/decode_fixed.cu | 188 +++++++++++++++-------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index fce8f53700d..5fe14d09e9f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,54 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. 
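// For the list decode path that need looks roughly like the following (a sketch mirroring the
// call made later in gpuUpdateValidityAndRowIndicesLists; the names are taken from that usage).
// Each thread's validity bit sits at its value offset within the warp rather than at its lane,
// so the ballot word and the "bits before mine" mask must be built by hand and passed to the
// warp_bits/lane_mask overload:
//
//   uint32_t const warp_valid_mask =
//     WarpReduceOr32(uint32_t(is_valid) << thread_value_count_within_warp);
//   uint32_t const thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1;
//   block_scan_results valid_count_scan_results;
//   scan_block_exclusive_sum<decode_block_size>(
//     warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results);
//   // thread_count_within_block / block_count then give the valid-value prefix and total.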
+struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +__device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +{ + //Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + //Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + //Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + //Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -265,7 +313,7 @@ struct decode_fixed_width_split_values_func { }; template -static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( +static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; @@ -552,48 +600,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } -struct scan_results -{ - uint32_t warp_bits; - int thread_count_within_warp; - int warp_count; - - int thread_count_within_block; - int block_count; -}; - -template -static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) -{ - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - results.warp_bits = warp_bits; - results.warp_count = __popc(results.warp_bits); - results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - - __shared__ uint32_t warp_counts[num_warps]; - if(warp_lane == 0) { - warp_counts[warp_index] = results.warp_count; - } - __syncthreads(); - - results.block_count = 0; - results.thread_count_within_block = results.thread_count_within_warp; - for(int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if(warp_idx < warp_index) { - results.thread_count_within_block += warp_counts[warp_idx]; - } - } -} - -template -static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) -{ - uint32_t warp_bits = ballot(thread_bit); - scan_block(warp_bits, warp_lane, warp_index, lane_mask, results); -} - template static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* 
sb, level_t const* const def, @@ -630,13 +636,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - - int const warp_lane = t % cudf::detail::warp_size; - bool const is_first_lane = warp_lane == 0; - int const warp_index = t / cudf::detail::warp_size; - uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = (warp_lane == 0); while (value_count < target_value_count) { @@ -647,10 +649,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int rep_level = -1, def_level = -1, start_depth = -1, end_depth = -1; + int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - rep_level = static_cast(rep[index]); + int rep_level = static_cast(rep[index]); if constexpr (nullable) { def_level = static_cast(def[index]); end_depth = s->nesting_info[def_level].end_depth; @@ -686,16 +688,19 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. - //THIS IS THE UNDO POINT int const is_new_row = start_depth == 0 ? 1 : 0; int num_prior_new_rows, total_num_new_rows; - block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); - __syncthreads(); + { + block_scan_results new_row_scan_results; + scan_block_exclusive_sum(is_new_row, new_row_scan_results); + num_prior_new_rows = new_row_scan_results.thread_count_within_block; + total_num_new_rows = new_row_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && (bool(is_new_row) != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d, rep_level %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth, rep_level); + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth); } if(within_batch && (num_prior_new_rows != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", @@ -731,13 +736,16 @@ if constexpr (enable_print_large_list) { // queries is_valid from all threads, stores prior total and total total //WARP VALUE COUNT: - scan_results value_count_scan_results; - scan_block(in_nesting_bounds, warp_lane, warp_index, lane_mask, value_count_scan_results); - - int thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; - int warp_value_count = value_count_scan_results.warp_count; - int thread_value_count = value_count_scan_results.thread_count_within_block; - int block_value_count = value_count_scan_results.block_count; + int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; + { + block_scan_results value_count_scan_results; + scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); + + thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + 
warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { @@ -798,14 +806,15 @@ if constexpr (enable_print_large_list) { // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); - auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - - scan_results valid_count_scan_results; - scan_block(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - - int warp_valid_count = valid_count_scan_results.warp_count; - int thread_valid_count = valid_count_scan_results.thread_count_within_block; - int block_valid_count = valid_count_scan_results.block_count; + int thread_valid_count, block_valid_count; + { + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; + + block_scan_results valid_count_scan_results; + scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + thread_valid_count = valid_count_scan_results.thread_count_within_block; + block_valid_count = valid_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { @@ -846,13 +855,15 @@ if constexpr (enable_print_large_list) { (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; //NEXT WARP VALUE COUNT: - scan_results next_value_count_scan_results; - scan_block(next_in_nesting_bounds, warp_lane, warp_index, lane_mask, next_value_count_scan_results); - - next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; - next_warp_value_count = next_value_count_scan_results.warp_count; - next_thread_value_count = next_value_count_scan_results.thread_count_within_block; - next_block_value_count = next_value_count_scan_results.block_count; + { + block_scan_results next_value_count_scan_results; + scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); + + next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count = next_value_count_scan_results.thread_count_within_block; + next_block_value_count = next_value_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { @@ -894,12 +905,12 @@ if constexpr (enable_print_large_list) { printf("WHOA BAD OFFSET\n"); printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, def_level %d, ni.value_count %d, " "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " "target_value_count %d, block_value_count %d, next_block_value_count %d\n", ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + total_num_new_rows, def_level, ni.value_count, thread_value_count, next_ni.value_count, next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); } } @@ -927,7 +938,6 @@ if constexpr (enable_print_large_list) { //TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count //so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... - int warp_null_count = 0; if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { // last bit in the warp to store //in old is warp_valid_mask_bit_count //so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level @@ -936,14 +946,11 @@ if constexpr (enable_print_large_list) { //is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; - store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); - warp_null_count = warp_value_count - warp_valid_count; if constexpr (enable_print) { printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); - printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); } } @@ -1148,8 +1155,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) DecodeValuesFunc decode_values; - bool const nullable = is_nullable(s); - bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. 
all shared memory is suballocated out of here static constexpr auto align_test = false; @@ -1244,8 +1250,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } if constexpr (enable_print) { - if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", - page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } + if(t == 0) { printf("page_idx %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", + page_idx, int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } } auto print_nestings = [&](bool is_post){ @@ -1295,7 +1301,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } } } else if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); if constexpr (enable_print) { if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } @@ -1321,7 +1327,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_nesting_t) { next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( + gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, nullptr, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( From 4b7d1df38570663ffcfc25cc4eb5223331ce7c71 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:05:53 -0400 Subject: [PATCH 25/38] minor tweaks --- cpp/src/io/parquet/decode_fixed.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f058ac310db..05d9aeb1b5b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1098,20 +1098,18 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { static constexpr bool enable_print = false; - //Dictionary + // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: + // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); if constexpr (enable_print) { if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } } - //it could be that (e.g.) 
we skip 5000 but starting at row 4000 we have a run of length 2000: - //in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front - //modulo 2 * block_size of course, since that's as many as we process at once while (num_skipped < num_to_skip) { - auto const to_skip = min(2*decode_block_size_t, num_to_skip - num_skipped); - parquet_stream.decode_next(t, to_skip); - num_skipped += to_skip; + auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); + num_skipped += parquet_stream.decode_next(t, to_decode); if constexpr (enable_print) { - if (t == 0) { printf("EXTRA SKIPPED: to_skip %d, at %d, for %d\n", to_skip, num_skipped, num_to_skip); } + if (t == 0) { printf("EXTRA SKIPPED: to_decode %d, at %d, for %d\n", to_decode, num_skipped, num_to_skip); } } __syncthreads(); } @@ -1240,7 +1238,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } } - + if constexpr (enable_print) { if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); From b36b3b29769f7c6a088c44d673684c6cb187afc8 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:47:45 -0400 Subject: [PATCH 26/38] delete some debug printing --- .../cudf/table/experimental/row_operators.cuh | 14 +------------- cpp/src/io/parquet/page_data.cuh | 18 ------------------ cpp/src/io/parquet/reader_impl.cpp | 9 --------- cpp/src/io/parquet/reader_impl_preprocess.cu | 12 ------------ cpp/src/io/parquet/rle_stream.cuh | 6 ------ 5 files changed, 1 insertion(+), 58 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e4aca2f142a..3f33c70c29a 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1429,30 +1429,18 @@ class device_row_comparator { __device__ bool operator()(size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { - static constexpr bool enable_print = false; if (check_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; bool const rhs_is_null{rhs.is_null(rhs_element_index)}; if (lhs_is_null and rhs_is_null) { return nulls_are_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { - if constexpr (enable_print) { - printf("NULLS UNEQUAL AT %d, %d; values: %d %d\n", - lhs_element_index, rhs_element_index, int(lhs_is_null), int(rhs_is_null)); - } return false; } } - bool result = comparator(lhs.element(lhs_element_index), + return comparator(lhs.element(lhs_element_index), rhs.element(rhs_element_index)); - if constexpr (enable_print && cuda::std::is_integral_v) { - if(!result) { - printf("VALUES UNEQUAL: AT %d, %d, VALUES %d, %d\n", lhs_element_index, rhs_element_index, - (int)lhs.element(lhs_element_index), (int)rhs.element(rhs_element_index)); - } - } - return result; } template data_start; } dict_pos *= (uint32_t)s->dtype_len_in; - - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (threadIdx.x == 0) { - auto dict_lookup_idx = rolling_index(src_pos); - printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", - dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); - } - } - 
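  // (Note: dict_pos was scaled by dtype_len_in above, so by this point it is a byte offset into
  // the dictionary page rather than an entry index when handed to gpuStoreOutput with dict_size.)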
gpuStoreOutput(dst, dict, dict_pos, dict_size); } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 43c11f917ab..9f66160f73c 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -50,8 +50,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; -//printf("PREP LAUNCH: decode_page_data: mode %d, skip_rows %lu, num_rows %lu, #pages %lu\n", -// (int)mode, skip_rows, num_rows, subpass.pages.size()); auto& page_nesting = subpass.page_nesting_info; auto& page_nesting_decode = subpass.page_nesting_decode_info; @@ -223,11 +221,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - printf("PAGE DATA DECODE MASK: %d\n", kernel_mask); - } - // launch string decoder int s_idx = 0; if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { @@ -419,11 +412,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); -//printf("SYNC ERROR CODE\n"); if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } -//printf("ERROR CODE SUNK\n"); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. Maybe use thrust::for_each diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 2bb96f0087d..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -138,12 +138,7 @@ void generate_depth_remappings( // depth. 
// - static constexpr bool enable_print = false; - // compute "X" from above - if constexpr (enable_print) { - printf("REMAPPING: max def %d, max rep %d\n", schema.max_definition_level, schema.max_repetition_level); - } for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { auto find_shallowest = [&](int r) { int shallowest = -1; @@ -162,9 +157,6 @@ void generate_depth_remappings( if (!cur_schema.is_stub()) { cur_depth--; } schema_idx = cur_schema.parent_idx; } - if constexpr (enable_print) { - printf("REMAPPING: s_idx / r %d, shallowest %d\n", r, shallowest); - } return shallowest; }; rep_depth_remap[s_idx] = find_shallowest(s_idx); @@ -203,10 +195,6 @@ void generate_depth_remappings( prev_schema = cur_schema; schema_idx = cur_schema.parent_idx; } - - if constexpr (enable_print) { - printf("REMAPPING: s_idx %d, r1 %d, end_depth %d\n", s_idx, r1, depth); - } return depth; }; def_depth_remap[s_idx] = find_deepest(s_idx); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 490cf1d43c3..24db60d11b6 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -252,8 +252,6 @@ struct rle_stream { run.level_run = level_run; run.remaining = run.size; cur += run_bytes; -//printf("STORE RUN: decode_index %d, fill_index %d, output_pos %d, run.size %d\n", - //decode_index, fill_index, output_pos, run.size); output_pos += run.size; fill_index++; } @@ -355,8 +353,6 @@ struct rle_stream { // this is the last batch we will process this iteration if: // - either this run still has remaining values // - or it is consumed fully and its last index corresponds to output_count -//printf("STATUS: run_index %d, batch_len %d, remaining %d, at_end %d, last_run_pos %d, cur_values %d\n", - //run_index, batch_len, remaining, at_end, last_run_pos, cur_values); if (remaining > 0 || at_end) { values_processed_shared = output_count; } if (remaining == 0 && (at_end || is_last_decode_warp(warp_id))) { decode_index_shared = run_index + 1; @@ -401,7 +397,6 @@ struct rle_stream { } if((output_pos + run_size) > target_count) { -//printf("SKIPPING: target_count %d, run_size %d, output_pos %d\n", target_count, run_size, output_pos); return output_pos; //bail! 
we've reached the starting one } @@ -409,7 +404,6 @@ struct rle_stream { cur += run_bytes; } -//printf("SKIPPING: target_count %d, output_pos %d\n", target_count, output_pos); return output_pos; //we skipped everything } From 5b157042d5a2e2cd9de65e141be3c0e2e8528a47 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:54:41 -0400 Subject: [PATCH 27/38] Remove more prints --- cpp/src/io/parquet/decode_fixed.cu | 343 ----------------------------- 1 file changed, 343 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 05d9aeb1b5b..e7d7582cd2c 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -88,18 +88,6 @@ __device__ inline void gpuDecodeFixedWidthValues( auto const data_out = nesting_info_base[leaf_level_index].data_out; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; - static constexpr bool enable_print = false; - static constexpr bool enable_print_range_error = false; -// static constexpr bool enable_print_large_list = true; - - if constexpr (enable_print) { - if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " - "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", - start, end, s->first_row, leaf_level_index, dtype_len, data_out, s->dict_base, s->dict_bits, s->dict_val, - s->dict_size, s->data_start, skipped_leaf_values, s->input_row_count); - } - } - // decode values int pos = start; while (pos < end) { @@ -116,18 +104,8 @@ __device__ inline void gpuDecodeFixedWidthValues( dst_pos -= s->first_row; } - if constexpr (has_lists_t && enable_print_range_error) { - if((dst_pos < 0) && (src_pos < target_pos)) { printf("WHOA: decode dst_pos %d out of bounds, src_pos %d, start %d\n", dst_pos, src_pos, start); } - } - int dict_idx = rolling_index(src_pos + skipped_leaf_values); int dict_pos = sb->dict_idx[dict_idx]; - if constexpr (enable_print) { - if(t == 0) { - printf("DECODE OFFSETS: pos %d, src_pos %d, offset %d, dst_pos %d, target_pos %d, dict_idx %d, dict_pos %d\n", - pos, src_pos, offset, dst_pos, target_pos, dict_idx, dict_pos); - } - } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
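Note on the indexing in the hunk above: nz_idx is a rolling buffer that translates a dense value position (nulls excluded) into the output row position (which still has gaps for nulls). A minimal host-side sketch of that translation follows; the buffer size and the is_list flag are illustrative assumptions for the example, not the kernel's actual constants:

#include <vector>

// Dense value index (nulls excluded) -> output row position (nulls leave gaps).
// nz_idx behaves as a rolling buffer, so the dense index wraps around its size.
inline int dense_to_output_pos(std::vector<int> const& nz_idx,
                               int src_pos,
                               int first_row,
                               bool is_list)
{
  int const buffer_size = static_cast<int>(nz_idx.size());
  int dst_pos = nz_idx[src_pos % buffer_size];
  // non-list columns are written relative to first_row; list columns instead
  // account for skipped leading values via skipped_leaf_values
  if (!is_list) { dst_pos -= first_row; }
  return dst_pos;  // may be negative for values that fall before first_row
}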
@@ -143,12 +121,6 @@ __device__ inline void gpuDecodeFixedWidthValues( } void* dst = data_out + static_cast(dst_pos) * dtype_len; - if constexpr (enable_print) { - if(dst_pos == 0) { - printf("WRITTEN TO dst_pos ZERO: t %d, data_out %p, dst %p, src_pos %d, dict_idx %d, dict_pos %d, dict_base %p\n", - t, data_out, dst, src_pos, dict_idx, dict_pos, s->dict_base); - } - } if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -182,17 +154,6 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } -/* - if constexpr (enable_print_large_list) { - if (dtype == INT32) { - int value_stored = *static_cast(dst); - int overall_index = blockIdx.x * 20000 * 4 + src_pos; - if((overall_index % 1024) != value_stored) { - printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); - } - } - } - */ } pos += batch_size; @@ -328,12 +289,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (t == 0) { printf("NESTED: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } - } - int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -343,9 +298,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( __syncthreads(); while (value_count < capped_target_value_count) { - if constexpr (enable_print) { - if(t == 0) { printf("NESTED VALUE COUNT: %d\n", value_count); } - } int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level @@ -365,11 +317,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - if constexpr (enable_print) { - if(t == 0) { printf("NESTED ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d\n", - row_index, row_index_lower_bound, last_row, in_row_bounds); } - } - // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; @@ -421,10 +368,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; - if constexpr (enable_print) { - if(t == 0) {printf("NESTED STORE: first_row %d, row_index %d dst_pos %d, src_pos %d\n", - first_row, row_index, dst_pos, src_pos);} - } } // update stuff max_depth_valid_count += block_valid_count; @@ -464,22 +407,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (t == 0) { printf("FLAT: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } - } - int const valid_map_offset = 
ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; __syncthreads(); while (value_count < capped_target_value_count) { - if constexpr (enable_print) { - if(t == 0) { printf("FLAT VALUE COUNT: %d\n", value_count); } - } - int const batch_size = min(max_batch_size, capped_target_value_count - value_count); int const thread_value_count = t; @@ -640,23 +573,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; - static constexpr bool enable_print = false; - static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = false; - // how many rows we've processed in the page so far int input_row_count = s->input_row_count; - if constexpr (enable_print) { - if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } - } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; - if constexpr (enable_print) { - if (t == 0) { printf("LIST s->input_value_count %d, first_row %d, last_row %d, target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count); } - } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -669,10 +591,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( bool const is_first_lane = (warp_lane == 0); while (value_count < target_value_count) { - - if constexpr (enable_print) { - if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } - } bool const within_batch = value_count + t < target_value_count; // get definition level, use repitition level to get start/end depth @@ -689,28 +607,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( } //computed by generate_depth_remappings() - if constexpr (enable_print || enable_print_range_error) { - if((rep_level < 0) || (rep_level > max_depth)) { - printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); - } - if(nullable && ((def_level < 0)/* || (def_level > (max_depth + 1)) */ )) { - printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); - } - } - start_depth = s->nesting_info[rep_level].start_depth; - if constexpr (enable_print || enable_print_range_error) { - if((start_depth < 0) || (start_depth > (max_depth + 1))) { - printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); - } - if((end_depth < 0) || (end_depth > (max_depth + 1))) { - printf("WHOA: end_depth %d out of bounds (max_depth %d) (index %d)!\n", end_depth, max_depth, index); - } - } - if constexpr (enable_print) { - if (t == 0) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d, max_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth, max_depth); } - } } //Determine value count & row index @@ -725,25 +622,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( total_num_new_rows = new_row_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && (bool(is_new_row) != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth); - } - if(within_batch && (num_prior_new_rows != ((t + 3) 
/ 4))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", - blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); - } - if((value_count + 128 <= target_value_count) && (total_num_new_rows != 32)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", - blockIdx.x, value_count, target_value_count, t, total_num_new_rows); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } - } - int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -754,13 +632,6 @@ if constexpr (enable_print_large_list) { // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - if constexpr (enable_print) { - if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", - row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } - if (t < 32) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", - t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } - } - // queries is_valid from all threads, stores prior total and total total //WARP VALUE COUNT: @@ -775,29 +646,6 @@ if constexpr (enable_print_large_list) { block_value_count = value_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, " - "in_row_bounds %d, row_index %d, input_row_count %d, row_index_lower_bound %d, last_row %d, first_row %d, s->num_rows %d\n", - blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count, - row_index_lower_bound, last_row, first_row, s->num_rows); - } - if(within_batch && in_row_bounds && (thread_value_count != ((t + 3) / 4))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, thread_value_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (block_value_count != 32)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, block_value_count); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < 32) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", - t, thread_value_count, in_nesting_bounds); } - } - // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -811,13 +659,6 @@ if constexpr (enable_print_large_list) { is_valid = in_nesting_bounds; } - if constexpr (enable_print) { - if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", - int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < 32) { printf("t %d, def_level %d, 
in_nesting_bounds %d, is_valid %d\n", - t, def_level, in_nesting_bounds, is_valid); } - } - // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -844,32 +685,6 @@ if constexpr (enable_print_large_list) { block_valid_count = valid_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); - } - if (within_batch && in_row_bounds && (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t)))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128)))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - } -} - - if constexpr (enable_print) { - if((block_valid_count == 0) && (t == 0) && (d_idx == max_depth)) { - printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " - "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", - def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, - row_index_lower_bound, last_row, input_row_count); } - - if (t == 0) { printf("block_valid_count %u\n", int(block_valid_count)); } - if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } - } - // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting // level. 
more concretely : the offset for the current nesting level == current length of the @@ -893,28 +708,6 @@ if constexpr (enable_print_large_list) { next_block_value_count = next_value_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); - } - if(within_batch && in_row_bounds && (next_thread_value_count != t)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_thread_value_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (next_block_value_count != 128)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_block_value_count); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < 32) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", - t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < 32) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } - } - // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. @@ -925,34 +718,6 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; - -/* -if constexpr (enable_print_large_list) { - int overall_index = 4*(blockIdx.x * 20000 + idx); - if(overall_index != ofs) { - printf("WHOA BAD OFFSET\n"); - printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " - "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, def_level %d, ni.value_count %d, " - "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " - "target_value_count %d, block_value_count %d, next_block_value_count %d\n", - ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, - next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, def_level, ni.value_count, thread_value_count, next_ni.value_count, - next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); - } -} -*/ - if constexpr (enable_print || enable_print_range_error) { - if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } - if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } - } - - if constexpr (enable_print) { - if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } - if (t < 32) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", - t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } - } } } @@ -975,19 +740,10 @@ if constexpr (enable_print_large_list) { // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); - - if constexpr (enable_print) { - printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", - t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); - } } if (t == 0) { size_type const block_null_count = block_value_count - block_valid_count; - if constexpr (enable_print) { - if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", - d_idx, ni.null_count, block_null_count); } - } ni.null_count += block_null_count; } } @@ -1003,23 +759,6 @@ if constexpr (enable_print_large_list) { int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - if constexpr (enable_print || enable_print_range_error) { - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index STORE %d out of bounds!\n", output_index); - } - if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } - } - - if constexpr (enable_print) { - if (t == 0) { printf("ni.value_count %d, max_depth_valid_count %d\n", int(ni.value_count), max_depth_valid_count); } - if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, max_depth_valid_count %d, thread_value_count %d, thread_valid_count %d\n", - output_index, dst_pos, ni.value_count, max_depth_valid_count, thread_value_count, thread_valid_count);} - - if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } - } - //Index from rolling buffer of values (which doesn't include nulls) to final array (which 
includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } @@ -1041,18 +780,10 @@ if constexpr (enable_print_large_list) { thread_value_count_within_warp = next_thread_value_count_within_warp; } //END OF DEPTH LOOP - if constexpr (enable_print) { - if (t == 0) { printf("END DEPTH LOOP\n"); } - } - int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } - if constexpr (enable_print) { - if (t == 0) { printf("END LOOP\n"); } - } - if (t == 0) { // update valid value count for decoding and total # of values we've processed s->nesting_info[max_depth].valid_count = max_depth_valid_count; @@ -1096,21 +827,13 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { - static constexpr bool enable_print = false; - // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); - if constexpr (enable_print) { - if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } - } while (num_skipped < num_to_skip) { auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); num_skipped += parquet_stream.decode_next(t, to_decode); - if constexpr (enable_print) { - if (t == 0) { printf("EXTRA SKIPPED: to_decode %d, at %d, for %d\n", to_decode, num_skipped, num_to_skip); } - } __syncthreads(); } @@ -1227,27 +950,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values); } - static constexpr bool enable_print = false; - rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); - if constexpr (enable_print) { - if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", - s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } - } - } - - if constexpr (enable_print) { - if((t == 0) && (page_idx == 0)){ - printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); - } - if constexpr (has_lists_t){ - printf("Is fixed list page\n"); - } else { - printf("Is fixed non-list page\n"); - } } // We use two counters in the loop below: processed_count and valid_count. 
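For context on the skip logic touched above: skip_decode() only advances over whole RLE runs, so when the skip target lands inside a run the caller has to decode (and discard) the remainder itself. A small host-side sketch of that arithmetic, assuming a plain vector of run sizes rather than the real rle_stream state:

#include <vector>

// Skip whole runs until the next run would cross `target`; returns how many
// values were actually skipped, which may be less than `target` when the
// target falls inside a run.
int skip_whole_runs(std::vector<int> const& run_sizes, int target)
{
  int skipped = 0;
  for (int size : run_sizes) {
    if (skipped + size > target) { break; }  // this run straddles the target: bail
    skipped += size;
  }
  return skipped;
}

// Example: runs of 4000 and 2000 values with target 5000 -> only 4000 are
// skipped; the remaining 1000 must be decoded up front by the caller.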
@@ -1275,36 +981,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } } - if constexpr (enable_print) { - if(t == 0) { printf("page_idx %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", - page_idx, int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } - } - - auto print_nestings = [&](bool is_post){ - if constexpr (enable_print) { - auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { - printf("page_idx %d, max_def_level %d, start_depth %d, end_depth %d, page_start_value %d, null_count %d, " - "valid_map_offset %d, valid_count %d, value_count %d\n", - page_idx, ni.max_def_level, ni.start_depth, ni.end_depth, ni.page_start_value, ni.null_count, - ni.valid_map_offset, ni.valid_count, ni.value_count); - }; - - if(t == 0) { - printf("POST %d NESTING 0: ", int(is_post)); - print_nesting_level(s->nesting_info[0]); - printf("POST %d NESTING 1: ", int(is_post)); - print_nesting_level(s->nesting_info[1]); - //printf("POST %d NESTING 2: ", int(is_post)); - //print_nesting_level(s->nesting_info[2]); - } - } - }; - - print_nestings(false); - if constexpr (enable_print) { - if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} - } - int last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { @@ -1318,22 +994,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_lists_t) { rep_decoder.decode_next(t); __syncthreads(); - - int value_count = s->input_value_count; next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); - if constexpr (enable_print) { - if(t == 0) { printf("LISTS NEXT: next_valid_count %d\n", next_valid_count); } - if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " - "processed_count %d, valid_count %d, next_valid_count %d\n", - s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } - } } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); - if constexpr (enable_print) { - if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } - } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -1346,7 +1011,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_lists_t) { processed_count += rep_decoder.decode_next(t); __syncthreads(); - next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, nullptr, rep, t); @@ -1371,15 +1035,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); valid_count = next_valid_count; - - if constexpr (enable_print) { - if(t == 0) { printf("LOOP: processed_count %d, #page values %d, error %d\n", - processed_count, s->page.num_input_values, s->error); } - } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } - - print_nestings(true); } } // anonymous namespace From e84af82274bd241f39c686f1286eed02ebb4f2bc Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 8 Oct 2024 13:03:14 -0400 Subject: [PATCH 28/38] cleanup --- cpp/src/io/parquet/decode_fixed.cu | 46 +++++++++++++----------------- cpp/src/io/parquet/page_decode.cuh | 2 +- cpp/src/io/parquet/page_hdr.cu | 22 +++++--------- cpp/src/io/parquet/rle_stream.cuh | 30 
+++++-------------- 4 files changed, 35 insertions(+), 65 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e7d7582cd2c..d60b4f79168 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -80,40 +80,35 @@ __device__ inline void gpuDecodeFixedWidthValues( constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; - + // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; uint32_t dtype_len = s->dtype_len; - auto const data_out = nesting_info_base[leaf_level_index].data_out; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - int const target_pos = pos + batch_size; int src_pos = pos + t; - // the position in the output column/buffer -//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) - auto offset = sb->nz_idx[rolling_index(src_pos)]; - int dst_pos = offset; + //Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; if constexpr (!has_lists_t) { dst_pos -= s->first_row; } - int dict_idx = rolling_index(src_pos + skipped_leaf_values); - int dict_pos = sb->dict_idx[dict_idx]; - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values // src_pos represents the logical row position we want to read from. But in the case of - // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. if constexpr (has_lists_t) { @@ -176,10 +171,14 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values @@ -199,11 +198,9 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { - // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; // src_pos represents the logical row position we want to read from. 
But in the case of - // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. if constexpr (has_lists_t) { @@ -212,8 +209,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -862,7 +858,7 @@ template typename DecodeValuesFunc> -CUDF_KERNEL void __launch_bounds__(decode_block_size_t) +CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, size_t min_row, @@ -907,18 +903,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - static constexpr auto align_test = false; - static constexpr size_t buffer_alignment = align_test ? 128 : 16; constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), buffer_alignment) : 0; + sizeof(rle_run), size_t{16}) : 0; constexpr int shared_dict_size = has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment) + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) : 0; constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment); + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; - __shared__ __align__(buffer_alignment) uint8_t shared_buf[shared_buf_size]; + __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 3adc02c9387..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1431,4 +1431,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::detail \ No newline at end of file +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 3fad8e344ea..85a55fa97c9 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,29 +181,21 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; - } - - if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { - if (page.encoding == Encoding::PLAIN) { - return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; - } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST; - } else if (page.encoding == Encoding::PLAIN_DICTIONARY || - page.encoding == 
Encoding::RLE_DICTIONARY) { - return decode_kernel_mask::FIXED_WIDTH_DICT_LIST; - } } - if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : + is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST : + is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST : + is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 24db60d11b6..caa7c45840e 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -375,9 +375,9 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount - //so thread 0 spins like crazy on fill_run_batch(), skipping writing unnecessary run info + //so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info //then when it hits the one that matters, we don't process it at all and bail as if we never started - //basically we're setting up the global vars necessary to start fill_run_batch for the first time + //basically we're setting up the rle_stream vars necessary to start fill_run_batch for the first time while (cur < end) { // bytes for the varint header uint8_t const* _cur = cur; @@ -397,9 +397,10 @@ struct rle_stream { } if((output_pos + run_size) > target_count) { - return output_pos; //bail! we've reached the starting one + return output_pos; //bail! we've reached the starting run } + //skip this run output_pos += run_size; cur += run_bytes; } @@ -412,27 +413,10 @@ struct rle_stream { { int const output_count = min(count, total_values - cur_values); - // special case. if level_bits == 0, just return all zeros. this should tremendously speed up + // if level_bits == 0, there's nothing to do // a very common case: columns with no nulls, especially if they are non-nested - if (level_bits == 0) { - cur_values = output_count; - return output_count; - } - - __shared__ int values_processed_shared; - - __syncthreads(); - - // warp 0 reads ahead and fills `runs` array to be decoded by remaining warps. - if (t == 0) { - values_processed_shared = skip_runs(output_count); - } - __syncthreads(); - - cur_values = values_processed_shared; - - // valid for every thread - return values_processed_shared; + cur_values = (level_bits == 0) ? 
output_count : skip_runs(output_count); + return cur_values; } __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } From f2007484fe06ec2d18dfee18f6fd9c2d86d269eb Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 8 Oct 2024 15:47:55 -0400 Subject: [PATCH 29/38] cleanup comments --- cpp/src/io/parquet/decode_fixed.cu | 107 ++++++++++++----------------- 1 file changed, 43 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index d60b4f79168..2f3923de8fe 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -562,7 +562,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { - //What is the output of this? Validity bits and offsets to list starts constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; @@ -579,25 +578,30 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; int max_depth_valid_count = s->nesting_info[max_depth].valid_count; - - __syncthreads(); int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; bool const is_first_lane = (warp_lane == 0); + __syncthreads(); + while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; - // get definition level, use repitition level to get start/end depth + // get definition level, use repetition level to get start/end depth // different for each thread, as each thread has a different r/d int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); int rep_level = static_cast(rep[index]); if constexpr (nullable) { - def_level = static_cast(def[index]); - end_depth = s->nesting_info[def_level].end_depth; + if(def != nullptr) { + def_level = static_cast(def[index]); + end_depth = s->nesting_info[def_level].end_depth; + } else { + def_level = 1; + end_depth = max_depth; + } } else { end_depth = max_depth; } @@ -622,15 +626,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - // thread and block value count - + // VALUE COUNT: // if we are within the range of nesting levels we should be adding value indices for // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; - - // queries is_valid from all threads, stores prior total and total total - - //WARP VALUE COUNT: int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { block_scan_results value_count_scan_results; @@ -642,9 +641,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( block_value_count = value_count_scan_results.block_count; } - // column is either nullable or is a list (or both): iterate by depth + // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -655,45 +653,32 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( is_valid = in_nesting_bounds; } - // thread and block validity count - // queries is_valid of all threads, stores prior total and total total - - // for nested lists, it's more complicated. This block will visit 128 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is thread_value_count. - - -//WARP VALID COUNT: - // for nested schemas, it's more complicated. This warp will visit 32 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is thread_value_count. for cuda 11 we could use - // __reduce_or_sync(), but until then we have to do a warp reduce. - uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); - int thread_valid_count, block_valid_count; - { - auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; + // VALID COUNT: + // Not all values visited by this block will represent a value at this nesting level. + // the validity bit for thread t might actually represent output value t-6. + // the correct position for thread t's bit is thread_value_count. + uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + int thread_valid_count, block_valid_count; + { + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - block_scan_results valid_count_scan_results; - scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - thread_valid_count = valid_count_scan_results.thread_count_within_block; - block_valid_count = valid_count_scan_results.block_count; - } + block_scan_results valid_count_scan_results; + scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + thread_valid_count = valid_count_scan_results.thread_count_within_block; + block_valid_count = valid_count_scan_results.block_count; + } // compute warp and thread value counts for the -next- nesting level. we need to - // do this for nested schemas so that we can emit an offset for the -current- nesting - // level. more concretely : the offset for the current nesting level == current length of the - // next nesting level + // do this for lists so that we can emit an offset for the -current- nesting level. 
+ // the offset for the current nesting level == current length of the next nesting level int next_thread_value_count_within_warp = 0, next_warp_value_count = 0; int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { - //mask is different between depths - next_in_nesting_bounds = - (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; -//NEXT WARP VALUE COUNT: + //NEXT DEPTH VALUE COUNT: + next_in_nesting_bounds = + ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); @@ -704,6 +689,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( next_block_value_count = next_value_count_scan_results.block_count; } + // STORE OFFSET TO THE LIST LOCATION // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. @@ -712,45 +698,37 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const idx = ni.value_count + thread_value_count; cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; - //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; } } - // validity is processed per-warp (on lane 0's), because writes are atomic + // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops // - // nested schemas always read and write to the same bounds + // lists always read and write to the same bounds // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. if constexpr (nullable) { -//TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count -//so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... 
- if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { - // last bit in the warp to store //in old is warp_valid_mask_bit_count -//so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level - // absolute bit offset into the output validity map - //is cumulative sum of warp_value_count at the given nesting depth + // is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); } if (t == 0) { - size_type const block_null_count = block_value_count - block_valid_count; - ni.null_count += block_null_count; + ni.null_count += block_value_count - block_valid_count; } } // if this is valid and we're at the leaf, output dst_pos - // Read these before the sync, so that when thread 0 modifies them we've already read their values + // Read value_count before the sync, so that when thread 0 modifies it we've already read its value int current_value_count = ni.value_count; - __syncthreads(); // handle modification of ni.value_count from below + __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { - // for non-list types, the value count is always the same across int const dst_pos = current_value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); @@ -766,7 +744,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } - __syncthreads(); // handle modification of ni.value_count from below + __syncthreads(); // sync modification of ni.value_count // propagate value counts for the next depth level block_value_count = next_block_value_count; @@ -959,10 +937,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // and valid_count is that running count. int processed_count = 0; int valid_count = 0; - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - //For lists (which can have skipped values, skip ahead in the decoding so that we don't repeat work + // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) if constexpr (has_lists_t){ if(s->page.skipped_leaf_values > 0) { if (should_process_nulls) { @@ -975,6 +951,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } } + // the core loop. 
decode batches of level stream data using rle_stream objects + // and pass the results to gpuDecodeValues int last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { @@ -1049,6 +1027,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric Date: Tue, 8 Oct 2024 15:54:28 -0400 Subject: [PATCH 30/38] style changes --- cpp/src/io/parquet/decode_fixed.cu | 189 +++++++++++++++-------------- cpp/src/io/parquet/page_hdr.cu | 18 +-- cpp/src/io/parquet/parquet_gpu.hpp | 7 +- cpp/src/io/parquet/rle_stream.cuh | 18 +-- 4 files changed, 119 insertions(+), 113 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 2f3923de8fe..159398a927e 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -39,7 +39,7 @@ struct block_scan_results { template __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - int const t = threadIdx.x; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; @@ -49,22 +49,26 @@ __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_sca } template -__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) { - //Compute # warps + // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - //Compute the warp-wide results + + // Compute the warp-wide results results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - //Share the warp counts amongst the block threads + // Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); - //Compute block-wide results + // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { @@ -82,9 +86,9 @@ __device__ inline void gpuDecodeFixedWidthValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; - auto const data_out = s->nesting_info[leaf_level_index].data_out; + auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; + int const dtype = s->col.physical_type; uint32_t dtype_len = s->dtype_len; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; @@ -94,13 +98,11 @@ __device__ inline void gpuDecodeFixedWidthValues( while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int src_pos = pos + t; - //Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + // Index from value buffer (doesn't include nulls) to final array (has gaps for 
nulls) int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { - dst_pos -= s->first_row; - } + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. @@ -111,9 +113,7 @@ __device__ inline void gpuDecodeFixedWidthValues( // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { - src_pos += skipped_leaf_values; - } + if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } void* dst = data_out + static_cast(dst_pos) * dtype_len; @@ -173,11 +173,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; - auto const data_out = s->nesting_info[leaf_level_index].data_out; + auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; - auto const data_len = thrust::distance(s->data_start, s->data_end); - auto const num_values = data_len / s->dtype_len_in; + int const dtype = s->col.physical_type; + auto const data_len = thrust::distance(s->data_start, s->data_end); + auto const num_values = data_len / s->dtype_len_in; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; @@ -187,29 +187,24 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { - dst_pos -= s->first_row; - } + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { - // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
- if constexpr (has_lists_t) { - src_pos += skipped_leaf_values; - } + if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; + uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -558,9 +553,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v } template -static __device__ int gpuUpdateValidityAndRowIndicesLists( - int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, - level_t const* const rep, int t) +static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + level_t const* const def, + level_t const* const rep, + int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; @@ -572,13 +570,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int input_row_count = s->input_row_count; // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; - int const last_row = first_row + s->num_rows; + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; - int max_depth_valid_count = s->nesting_info[max_depth].valid_count; - + int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; + int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; bool const is_first_lane = (warp_lane == 0); @@ -593,9 +591,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - int rep_level = static_cast(rep[index]); + int rep_level = static_cast(rep[index]); if constexpr (nullable) { - if(def != nullptr) { + if (def != nullptr) { def_level = static_cast(def[index]); end_depth = s->nesting_info[def_level].end_depth; } else { @@ -606,13 +604,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( end_depth = max_depth; } - //computed by generate_depth_remappings() + // computed by generate_depth_remappings() start_depth = s->nesting_info[rep_level].start_depth; } - //Determine value count & row index - // track (page-relative) row index for the thread so we can compare against input bounds - // keep track of overall # of rows we've read. + // Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 
1 : 0; int num_prior_new_rows, total_num_new_rows; { @@ -636,9 +634,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; - warp_value_count = value_count_scan_results.warp_count; - thread_value_count = value_count_scan_results.thread_count_within_block; - block_value_count = value_count_scan_results.block_count; + warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; } // iterate by depth @@ -654,18 +652,20 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( } // VALID COUNT: - // Not all values visited by this block will represent a value at this nesting level. - // the validity bit for thread t might actually represent output value t-6. - // the correct position for thread t's bit is thread_value_count. - uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + // Not all values visited by this block will represent a value at this nesting level. + // the validity bit for thread t might actually represent output value t-6. + // the correct position for thread t's bit is thread_value_count. + uint32_t const warp_valid_mask = + WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); int thread_valid_count, block_valid_count; { auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; block_scan_results valid_count_scan_results; - scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + scan_block_exclusive_sum( + warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); thread_valid_count = valid_count_scan_results.thread_count_within_block; - block_valid_count = valid_count_scan_results.block_count; + block_valid_count = valid_count_scan_results.block_count; } // compute warp and thread value counts for the -next- nesting level. we need to @@ -675,18 +675,19 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { - - //NEXT DEPTH VALUE COUNT: - next_in_nesting_bounds = + // NEXT DEPTH VALUE COUNT: + next_in_nesting_bounds = ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; - scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); + scan_block_exclusive_sum(next_in_nesting_bounds, + next_value_count_scan_results); - next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; - next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count_within_warp = + next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; next_thread_value_count = next_value_count_scan_results.thread_count_within_block; - next_block_value_count = next_value_count_scan_results.block_count; + next_block_value_count = next_value_count_scan_results.block_count; } // STORE OFFSET TO THE LIST LOCATION @@ -695,8 +696,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // emit an offset. 
if (in_nesting_bounds && ni.data_out != nullptr) { const auto& next_ni = s->nesting_info[d_idx + 1]; - int const idx = ni.value_count + thread_value_count; - cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; + int const idx = ni.value_count + thread_value_count; + cudf::size_type const ofs = + next_ni.value_count + next_thread_value_count + next_ni.page_start_value; (reinterpret_cast(ni.data_out))[idx] = ofs; } @@ -704,12 +706,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops // - // lists always read and write to the same bounds - // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // lists always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. if constexpr (nullable) { - if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { // absolute bit offset into the output validity map // is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value @@ -718,22 +720,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); } - if (t == 0) { - ni.null_count += block_value_count - block_valid_count; - } + if (t == 0) { ni.null_count += block_value_count - block_valid_count; } } // if this is valid and we're at the leaf, output dst_pos - // Read value_count before the sync, so that when thread 0 modifies it we've already read its value + // Read value_count before the sync, so that when thread 0 modifies it we've already read its + // value int current_value_count = ni.value_count; __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { - int const dst_pos = current_value_count + thread_value_count; - int const src_pos = max_depth_valid_count + thread_valid_count; + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + // Index from rolling buffer of values (which doesn't include nulls) to final array (which + // includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } max_depth_valid_count += block_valid_count; @@ -747,12 +749,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); // sync modification of ni.value_count // propagate value counts for the next depth level - block_value_count = next_block_value_count; - thread_value_count = next_thread_value_count; - in_nesting_bounds = next_in_nesting_bounds; - warp_value_count = next_warp_value_count; + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; thread_value_count_within_warp = next_thread_value_count_within_warp; - } //END OF DEPTH LOOP + } // END OF DEPTH LOOP int const batch_size = min(max_batch_size, target_value_count - value_count); 
value_count += batch_size; @@ -761,8 +763,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( if (t == 0) { // update valid value count for decoding and total # of values we've processed s->nesting_info[max_depth].valid_count = max_depth_valid_count; - s->nz_count = max_depth_valid_count; - s->input_value_count = value_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; // If we have lists # rows != # values s->input_row_count = input_row_count; @@ -881,8 +883,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_rep_size = + has_lists_t + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + : 0; constexpr int shared_dict_size = has_dict_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) @@ -893,9 +897,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; + int shared_offset = 0; rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ shared_offset += shared_rep_size; } + if constexpr (has_lists_t) { shared_offset += shared_rep_size; } rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); if constexpr (has_dict_t) { shared_offset += shared_dict_size; } @@ -911,10 +915,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) def, s->page.num_input_values); } - + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr (has_lists_t){ + if constexpr (has_lists_t) { rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], @@ -939,12 +943,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) int valid_count = 0; // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) - if constexpr (has_lists_t){ - if(s->page.skipped_leaf_values > 0) { + if constexpr (has_lists_t) { + if (s->page.skipped_leaf_values > 0) { if (should_process_nulls) { skip_decode(def_decoder, s->page.skipped_leaf_values, t); } - processed_count = skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + processed_count = + skip_decode(rep_decoder, s->page.skipped_leaf_values, t); if constexpr (has_dict_t) { skip_decode(dict_stream, s->page.skipped_leaf_values, t); } @@ -983,12 +988,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) if constexpr (has_lists_t) { processed_count += rep_decoder.decode_next(t); __syncthreads(); - next_valid_count = - gpuUpdateValidityAndRowIndicesLists( - processed_count, s, sb, nullptr, rep, t); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, nullptr, rep, t); } else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } } __syncthreads(); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 
85a55fa97c9..52d53cb8225 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -185,18 +185,18 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, if (!is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : - is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST : - is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST : - is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED - : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST + : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a77a5f5ad50..695cc40297d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -221,9 +221,10 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages - FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages for lists - FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages for lists - BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = + (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists }; // mask representing all the ways in which a string can be encoded diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index caa7c45840e..9270db16c08 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -374,10 +374,11 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { - //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount - //so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info - //then when it hits the one that matters, we don't process it at all and bail as if we never started - //basically we're setting up the rle_stream vars necessary to start fill_run_batch for the first time + // 
we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info + // then when it hits the one that matters, we don't process it at all and bail as if we never + // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for + // the first time while (cur < end) { // bytes for the varint header uint8_t const* _cur = cur; @@ -396,19 +397,18 @@ struct rle_stream { run_bytes += ((level_bits) + 7) >> 3; } - if((output_pos + run_size) > target_count) { - return output_pos; //bail! we've reached the starting run + if ((output_pos + run_size) > target_count) { + return output_pos; // bail! we've reached the starting run } - //skip this run + // skip this run output_pos += run_size; cur += run_bytes; } - return output_pos; //we skipped everything + return output_pos; // we skipped everything } - __device__ inline int skip_decode(int t, int count) { int const output_count = min(count, total_values - cur_values); From edc56bdd0e22b756f26c3f5ec26bcf9db3ea06d3 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 11 Oct 2024 12:44:49 -0400 Subject: [PATCH 31/38] constify variables --- cpp/benchmarks/CMakeLists.txt | 9 +- cpp/src/io/parquet/decode_fixed.cu | 149 +++++++++++++++++------------ 2 files changed, 94 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b8a53cd8bd9..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureNVBench(AST_NVBENCH ast/transform.cpp) +ConfigureBench(AST_BENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,6 +392,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) +# ################################################################################################## +# * multi buffer memset benchmark +# ---------------------------------------------------------------------- +ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) + # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 159398a927e..897c0c04be1 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -88,34 +88,43 @@ 
__device__ inline void gpuDecodeFixedWidthValues( int const leaf_level_index = s->col.max_nesting_depth - 1; auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; - uint32_t dtype_len = s->dtype_len; + int const dtype = s->col.physical_type; + uint32_t const dtype_len = s->dtype_len; - uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int const thread_pos = pos + t; // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } + int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - void* dst = data_out + static_cast(dst_pos) * dtype_len; + void* const dst = data_out + static_cast(dst_pos) * dtype_len; if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -179,7 +188,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; - uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -187,24 +196,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int const thread_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. 
So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } + int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - uint32_t dtype_len = s->dtype_len; - uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; + uint32_t const dtype_len = s->dtype_len; + uint8_t const* const src = s->data_start + src_pos; + uint8_t* const dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -292,12 +311,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level - int d = 1; - if (t >= batch_size) { - d = -1; - } else if (def) { - d = static_cast(def[rolling_index(value_count + t)]); - } + int const d = [&]() { + if (t >= batch_size) { + return -1; + } else if (def) { + return static_cast(def[rolling_index(value_count + t)]); + } else { + return 1; + } + }(); int const thread_value_count = t; int const block_value_count = batch_size; @@ -358,6 +380,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (is_valid) { int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff @@ -414,16 +437,17 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // use definition level & row bounds to determine if is valid - int is_valid; - if (t >= batch_size) { - is_valid = 0; - } else if (def) { - int const def_level = - static_cast(def[rolling_index(value_count + t)]); - is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = [&]() { + if (t >= batch_size) { + return 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + return ((def_level > 0) && in_row_bounds) ? 
1 : 0; + } else { + return in_row_bounds; + } + }(); // thread and block validity count using block_scan = cub::BlockScan; @@ -588,25 +612,25 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c // get definition level, use repetition level to get start/end depth // different for each thread, as each thread has a different r/d - int def_level = -1, start_depth = -1, end_depth = -1; - if (within_batch) { - int const index = rolling_index(value_count + t); - int rep_level = static_cast(rep[index]); - if constexpr (nullable) { + auto const [def_level, start_depth, end_depth] = [&]() { + if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } + + int const index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[index]); + int const start_depth = s->nesting_info[rep_level].start_depth; + + if constexpr (!nullable) { + return cuda::std::make_tuple(-1, start_depth, max_depth); + } else { if (def != nullptr) { - def_level = static_cast(def[index]); - end_depth = s->nesting_info[def_level].end_depth; + int const def_level = static_cast(def[index]); + return cuda::std::make_tuple( + def_level, start_depth, s->nesting_info[def_level].end_depth); } else { - def_level = 1; - end_depth = max_depth; + return cuda::std::make_tuple(1, start_depth, max_depth); } - } else { - end_depth = max_depth; } - - // computed by generate_depth_remappings() - start_depth = s->nesting_info[rep_level].start_depth; - } + }(); // Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds @@ -644,12 +668,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value - int is_valid; - if constexpr (nullable) { - is_valid = ((def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; - } else { - is_valid = in_nesting_bounds; - } + int const is_valid = [&](int input_def_level) { + if constexpr (nullable) { + return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + return in_nesting_bounds; + } + }(def_level); // VALID COUNT: // Not all values visited by this block will represent a value at this nesting level. 
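The hunks above lean on immediately invoked lambdas so that values produced by branching code can still be bound as const (the stated goal of the "constify variables" commit). A small self-contained sketch of that idiom, assuming host code with std::tuple instead of the kernel's cuda::std::tuple; the function name classify and its parameters are illustrative only:

#include <tuple>

std::tuple<int, int, int> classify(bool nullable, int rep_level, int def_level_in, int max_depth)
{
  // Structured binding over an immediately invoked lambda keeps all three results const.
  auto const [def_level, start_depth, end_depth] = [&]() {
    int const start_depth = rep_level;  // stand-in for nesting_info[rep_level].start_depth
    if (!nullable) { return std::make_tuple(-1, start_depth, max_depth); }
    return std::make_tuple(def_level_in, start_depth, def_level_in);  // stand-in for end_depth lookup
  }();
  return {def_level, start_depth, end_depth};
}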
@@ -726,7 +751,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c // if this is valid and we're at the leaf, output dst_pos // Read value_count before the sync, so that when thread 0 modifies it we've already read its // value - int current_value_count = ni.value_count; + int const current_value_count = ni.value_count; __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { @@ -944,21 +969,21 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) if constexpr (has_lists_t) { - if (s->page.skipped_leaf_values > 0) { + auto const skipped_leaf_values = s->page.skipped_leaf_values; + if (skipped_leaf_values > 0) { if (should_process_nulls) { - skip_decode(def_decoder, s->page.skipped_leaf_values, t); + skip_decode(def_decoder, skipped_leaf_values, t); } - processed_count = - skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); if constexpr (has_dict_t) { - skip_decode(dict_stream, s->page.skipped_leaf_values, t); + skip_decode(dict_stream, skipped_leaf_values, t); } } } // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - int last_row = s->first_row + s->num_rows; + int const last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { int next_valid_count; From e51406ce1992919bf567fc65125e2106015af239 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 11 Oct 2024 12:49:28 -0400 Subject: [PATCH 32/38] revert cmakelists change --- cpp/benchmarks/CMakeLists.txt | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # 
################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) From 07ffbf26046ae5fd72a112f030dbf233b1f7b677 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:40:46 -0400 Subject: [PATCH 33/38] Update cpp/src/io/parquet/rle_stream.cuh Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/rle_stream.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 9270db16c08..a84067743df 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -375,7 +375,7 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { // we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip - // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info. // then when it hits the one that matters, we don't process it at all and bail as if we never // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time From 32fe8b97dc46999dcfbf8d9abf4e74d6bbd88379 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 18 Oct 2024 12:48:46 -0400 Subject: [PATCH 34/38] refactor rle_stream --- cpp/src/io/parquet/decode_fixed.cu | 15 +++---- cpp/src/io/parquet/rle_stream.cuh | 69 ++++++++++++------------------ 2 files changed, 33 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 4e83a788747..e806e54a522 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -117,14 +117,11 @@ __device__ void gpuDecodeFixedWidthValues( // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. int const src_pos = [&]() { - if constexpr (has_lists_t) { - return thread_pos + skipped_leaf_values; - } else { - return thread_pos; - } + if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; } + return thread_pos; }(); - void* const dst = data_out + static_cast(dst_pos) * dtype_len; + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -316,9 +313,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( return -1; } else if (def) { return static_cast(def[rolling_index(value_count + t)]); - } else { - return 1; } + return 1; }(); int const thread_value_count = t; @@ -444,9 +440,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const def_level = static_cast(def[rolling_index(value_count + t)]); return ((def_level > 0) && in_row_bounds) ? 
1 : 0; - } else { - return in_row_bounds; } + return in_row_bounds; }(); // thread and block validity count diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index a84067743df..55339dbc289 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -216,6 +216,26 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } + __device__ inline int get_rle_run_info(rle_run& run) + { + run.start = cur; + run.level_run = get_vlq32(run.start, end); + + // run_bytes includes the header size + int run_bytes = run.start - cur; + if (is_literal_run(run.level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run.size = (run.level_run >> 1) * 8; + run_bytes += ((run.size * level_bits) + 7) >> 3; + } else { + // repeated value run + run.size = (run.level_run >> 1); + run_bytes += ((level_bits) + 7) >> 3; + } + + return run_bytes; + } + __device__ inline void fill_run_batch() { // decode_index == -1 means we are on the very first decode iteration for this stream. @@ -226,31 +246,12 @@ struct rle_stream { while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) || fill_index < decode_index + run_buffer_size) && cur < end) { - auto& run = runs[rolling_index(fill_index)]; - // Encoding::RLE + auto& run = runs[rolling_index(fill_index)]; + int const run_bytes = get_rle_run_info(run); + run.remaining = run.size; + run.output_pos = output_pos; - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - // run_bytes includes the header size - int run_bytes = _cur - cur; - - // literal run - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. - run.size = (level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; - } - // repeated value run - else { - run.size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } - run.output_pos = output_pos; - run.start = _cur; - run.level_run = level_run; - run.remaining = run.size; cur += run_bytes; output_pos += run.size; fill_index++; @@ -380,29 +381,15 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - - // run_bytes includes the header size - int run_bytes = _cur - cur; - int run_size; - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. - run_size = (level_run >> 1) * 8; - run_bytes += ((run_size * level_bits) + 7) >> 3; - } else { - // repeated value run - run_size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } + rle_run run; + int run_bytes = get_rle_run_info(run); - if ((output_pos + run_size) > target_count) { + if ((output_pos + run.size) > target_count) { return output_pos; // bail! 
we've reached the starting run } // skip this run - output_pos += run_size; + output_pos += run.size; cur += run_bytes; } From 031ac6b414abc9045040e27ce4c5c1d5fa6f1c3f Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 23 Oct 2024 11:58:41 -0400 Subject: [PATCH 35/38] Use divide function --- cpp/src/io/parquet/rle_stream.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 55339dbc289..78f6bcaa7b4 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include namespace cudf::io::parquet::detail { @@ -226,11 +227,11 @@ struct rle_stream { if (is_literal_run(run.level_run)) { // from the parquet spec: literal runs always come in multiples of 8 values. run.size = (run.level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; + run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8); } else { // repeated value run run.size = (run.level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; + run_bytes += util::div_rounding_up_unsafe(level_bits, 8); } return run_bytes; From 534e67de716dd880b638da68b0af7e2e0f4bf8fa Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 25 Oct 2024 16:06:44 -0400 Subject: [PATCH 36/38] address comments --- cpp/src/io/parquet/decode_fixed.cu | 26 ++++++++++++++------------ cpp/src/io/parquet/rle_stream.cuh | 6 ++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e806e54a522..cedced55d51 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -610,15 +610,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto const [def_level, start_depth, end_depth] = [&]() { if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } - int const index = rolling_index(value_count + t); - int const rep_level = static_cast(rep[index]); + int const level_index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[level_index]); int const start_depth = s->nesting_info[rep_level].start_depth; if constexpr (!nullable) { return cuda::std::make_tuple(-1, start_depth, max_depth); } else { if (def != nullptr) { - int const def_level = static_cast(def[index]); + int const def_level = static_cast(def[level_index]); return cuda::std::make_tuple( def_level, start_depth, s->nesting_info[def_level].end_depth); } else { @@ -639,13 +639,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c total_num_new_rows = new_row_scan_results.block_count; } - int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); + int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // VALUE COUNT: - // if we are within the range of nesting levels we should be adding value indices for - // is from/in current rep level to/in the rep level AT the depth with the def value + // in_nesting_bounds: if at a nesting level where we need to add value indices + // the bounds: from current rep to the rep AT the def depth int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { @@ -724,7 +724,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c } } - // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops + // validity is processed per-warp (on lane 0's) + // thi is because when atomic writes are needed, they are 32-bit operations // // lists always read and write to the same bounds // (that is, read and write positions are already pre-bounded by first_row/num_rows). @@ -820,7 +821,7 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } -template +template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: @@ -828,7 +829,8 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); while (num_skipped < num_to_skip) { - auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); + // TODO: Instead of decoding, skip within the run to the appropriate location + auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped); num_skipped += parquet_stream.decode_next(t, to_decode); __syncthreads(); } @@ -967,11 +969,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) auto const skipped_leaf_values = s->page.skipped_leaf_values; if (skipped_leaf_values > 0) { if (should_process_nulls) { - skip_decode(def_decoder, skipped_leaf_values, t); + skip_decode(def_decoder, skipped_leaf_values, t); } - processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); + processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); if constexpr (has_dict_t) { - skip_decode(dict_stream, skipped_leaf_values, t); + skip_decode(dict_stream, skipped_leaf_values, t); } } } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 78f6bcaa7b4..69e783a89d0 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -248,10 +248,12 @@ struct rle_stream { fill_index < decode_index + run_buffer_size) && cur < end) { // Encoding::RLE + // Pass by reference to fill the runs shared memory with the run data auto& run = runs[rolling_index(fill_index)]; int const run_bytes = get_rle_run_info(run); - run.remaining = run.size; - run.output_pos = output_pos; + + run.remaining = run.size; + run.output_pos = output_pos; cur += run_bytes; output_pos += run.size; From a6adb0d09692f816f217095f909a50a31c1e5b09 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 28 Oct 2024 16:14:30 -0400 Subject: [PATCH 37/38] Change scan interface to pass in shared memory to avoid sync issues --- cpp/src/io/parquet/decode_fixed.cu | 55 ++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index cedced55d51..6ad4d2233e2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -37,7 +37,14 @@ struct block_scan_results { }; template -__device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size]; + +// Similar to CUB, must __syncthreads() after calling if 
reusing temp_storage +template +__device__ inline static void scan_block_exclusive_sum( + int thread_bit, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; @@ -45,15 +52,19 @@ __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_sca uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; uint32_t warp_bits = ballot(thread_bit); - scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); + scan_block_exclusive_sum( + warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage); } +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage template -__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, - int warp_lane, - int warp_index, - uint32_t lane_mask, - block_scan_results& results) +__device__ static void scan_block_exclusive_sum( + uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; @@ -64,16 +75,15 @@ __device__ static void scan_block_exclusive_sum(uint32_t warp_bits, results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); // Share the warp counts amongst the block threads - __shared__ int warp_counts[num_warps]; - if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } - __syncthreads(); + if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; } + __syncthreads(); // Sync to share counts between threads/warps // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + results.block_count += temp_storage[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; } } } @@ -634,7 +644,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c int num_prior_new_rows, total_num_new_rows; { block_scan_results new_row_scan_results; - scan_block_exclusive_sum(is_new_row, new_row_scan_results); + __shared__ block_scan_temp_storage temp_storage; + scan_block_exclusive_sum(is_new_row, new_row_scan_results, temp_storage); num_prior_new_rows = new_row_scan_results.thread_count_within_block; total_num_new_rows = new_row_scan_results.block_count; } @@ -650,7 +661,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { block_scan_results value_count_scan_results; - scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); + __shared__ block_scan_temp_storage temp_storage; + scan_block_exclusive_sum( + in_nesting_bounds, value_count_scan_results, temp_storage); thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; warp_value_count = value_count_scan_results.warp_count; @@ -682,8 +695,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; block_scan_results valid_count_scan_results; - scan_block_exclusive_sum( - warp_valid_mask, warp_lane, warp_index, thread_mask, 
valid_count_scan_results); + __shared__ block_scan_temp_storage temp_storage; + scan_block_exclusive_sum(warp_valid_mask, + warp_lane, + warp_index, + thread_mask, + valid_count_scan_results, + temp_storage); thread_valid_count = valid_count_scan_results.thread_count_within_block; block_valid_count = valid_count_scan_results.block_count; } @@ -700,8 +718,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; - scan_block_exclusive_sum(next_in_nesting_bounds, - next_value_count_scan_results); + __shared__ block_scan_temp_storage temp_storage; + scan_block_exclusive_sum( + next_in_nesting_bounds, next_value_count_scan_results, temp_storage); next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; From f4aedb988693c5690e52e4949965429d779f8586 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 29 Oct 2024 11:55:45 -0400 Subject: [PATCH 38/38] switch to sharing memory between scans --- cpp/src/io/parquet/decode_fixed.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 6ad4d2233e2..45380e6ea20 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -611,6 +611,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c bool const is_first_lane = (warp_lane == 0); __syncthreads(); + __shared__ block_scan_temp_storage temp_storage; while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; @@ -644,8 +645,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c int num_prior_new_rows, total_num_new_rows; { block_scan_results new_row_scan_results; - __shared__ block_scan_temp_storage temp_storage; scan_block_exclusive_sum(is_new_row, new_row_scan_results, temp_storage); + __syncthreads(); num_prior_new_rows = new_row_scan_results.thread_count_within_block; total_num_new_rows = new_row_scan_results.block_count; } @@ -661,9 +662,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { block_scan_results value_count_scan_results; - __shared__ block_scan_temp_storage temp_storage; scan_block_exclusive_sum( in_nesting_bounds, value_count_scan_results, temp_storage); + __syncthreads(); thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; warp_value_count = value_count_scan_results.warp_count; @@ -695,13 +696,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; block_scan_results valid_count_scan_results; - __shared__ block_scan_temp_storage temp_storage; scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results, temp_storage); + __syncthreads(); thread_valid_count = valid_count_scan_results.thread_count_within_block; block_valid_count = valid_count_scan_results.block_count; } @@ -718,9 +719,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 
1 : 0; { block_scan_results next_value_count_scan_results; - __shared__ block_scan_temp_storage temp_storage; scan_block_exclusive_sum( next_in_nesting_bounds, next_value_count_scan_results, temp_storage); + __syncthreads(); next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp;
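The last two commits in this range change scan_block_exclusive_sum to take its shared-memory temp storage as a parameter and to require a __syncthreads() after each call before that storage is reused, mirroring the contract of cub::BlockScan already used elsewhere in this file. A minimal CUDA sketch of the reuse pattern; the kernel name two_scans_shared_temp and the in_a/in_b/out_a/out_b buffers are hypothetical:

#include <cub/block/block_scan.cuh>

template <int block_size>
__global__ void two_scans_shared_temp(int const* in_a, int const* in_b, int* out_a, int* out_b)
{
  using block_scan = cub::BlockScan<int, block_size>;
  __shared__ typename block_scan::TempStorage temp_storage;  // one buffer shared by both scans

  int a_sum;
  block_scan(temp_storage).ExclusiveSum(in_a[threadIdx.x], a_sum);
  __syncthreads();  // sync before temp_storage is reused, as the patch's comments note

  int b_sum;
  block_scan(temp_storage).ExclusiveSum(in_b[threadIdx.x], b_sum);

  out_a[threadIdx.x] = a_sum;
  out_b[threadIdx.x] = b_sum;
}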