Skip to content

Commit

Permalink
Handle parquet list data corner case (#12698)
Browse files Browse the repository at this point in the history
Fixes an issue with a particular arrangement of page data related to lists.  Specifically, it is possible for page `N` to contain "0" rows because the values for the row it is a part of start on page `N-1` and end on page `N+1`.  This was defeating logic in the decode kernel that would erroneously cause these values to be skipped.

Similar to #12488 this is only reproducible with data out in the wild.  In this case, we have a file that we could in theory check in to create a test with, but it is 16 MB so it's fairly large.  Looking for feedback on whether this is too big.

Authors:
  - https://github.com/nvdbaranec
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #12698
  • Loading branch information
nvdbaranec authored Feb 8, 2023
1 parent 8ad4166 commit 476d5bb
Showing 1 changed file with 42 additions and 8 deletions.
50 changes: 42 additions & 8 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,20 +104,41 @@ struct page_state_s {
* specified row bounds
*
* @param s The page to be checked
* @param min_row The starting row index
* @param start_row The starting row index
* @param num_rows The number of rows
*
* @return True if the page spans the beginning or the end of the row bounds
*/
inline __device__ bool is_bounds_page(page_state_s* const s, size_t min_row, size_t num_rows)
inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows)
{
size_t const page_begin = s->col.start_row + s->page.chunk_row;
size_t const page_end = page_begin + s->page.num_rows;
size_t const begin = min_row;
size_t const end = min_row + num_rows;
size_t const begin = start_row;
size_t const end = start_row + num_rows;

return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end));
}

/**
* @brief Returns whether or not a page is completely contained within the specified
* row bounds
*
* @param s The page to be checked
* @param start_row The starting row index
* @param num_rows The number of rows
*
* @return True if the page is completely contained within the row bounds
*/
inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows)
{
size_t const page_begin = s->col.start_row + s->page.chunk_row;
size_t const page_end = page_begin + s->page.num_rows;
size_t const begin = start_row;
size_t const end = start_row + num_rows;

return page_begin >= begin && page_end <= end;
}

/**
* @brief Read a 32-bit varint integer
*
Expand Down Expand Up @@ -1728,10 +1749,11 @@ __global__ void __launch_bounds__(block_size)
auto const thread_depth = depth + t;
if (thread_depth < s->page.num_output_nesting_levels) {
// if we are not a bounding page (as checked above) then we are either
// returning 0 rows from the page (completely outside the bounds) or all
// rows in the page (completely within the bounds)
// returning all rows/values from this page, or 0 of them
pp->nesting[thread_depth].batch_size =
s->num_rows == 0 ? 0 : pp->nesting[thread_depth].size;
(s->num_rows == 0 && !is_page_contained(s, min_row, num_rows))
? 0
: pp->nesting[thread_depth].size;
}
depth += blockDim.x;
}
Expand Down Expand Up @@ -1838,7 +1860,19 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData(
bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0;

// if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
if (s->num_rows == 0 && !(has_repetition && is_bounds_page(s, min_row, num_rows))) { return; }
//
// corner case: in the case of lists, we can have pages that contain "0" rows if the current row
// starts before this page and ends after this page:
// P0 P1 P2
// |---------|---------|----------|
// ^------------------^
// row start row end
// P1 will contain 0 rows
//
if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) ||
is_page_contained(s, min_row, num_rows)))) {
return;
}

if (s->dict_base) {
out_thread0 = (s->dict_bits > 0) ? 64 : 32;
Expand Down

0 comments on commit 476d5bb

Please sign in to comment.