diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 9cf791ef3d9..8cb01d5a34b 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -994,7 +994,10 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s,
   int chunk_idx;
 
   // Fetch page info
-  if (!t) s->page = *p;
+  if (!t) {
+    s->page         = *p;
+    s->nesting_info = nullptr;
+  }
   __syncthreads();
 
   if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; }
@@ -1889,6 +1892,26 @@ __global__ void __launch_bounds__(block_size)
   }
 }
 
+// Copies null counts back to `nesting_decode` at the end of scope
+struct null_count_back_copier {
+  page_state_s* s;
+  int t;
+  __device__ ~null_count_back_copier()
+  {
+    if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) {
+      int depth = 0;
+      while (depth < s->page.num_output_nesting_levels) {
+        int const thread_depth = depth + t;
+        if (thread_depth < s->page.num_output_nesting_levels) {
+          s->page.nesting_decode[thread_depth].null_count =
+            s->nesting_decode_cache[thread_depth].null_count;
+        }
+        depth += blockDim.x;
+      }
+    }
+  }
+};
+
 /**
  * @brief Kernel for co the column data stored in the pages
  *
@@ -1913,6 +1936,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData(
   int page_idx = blockIdx.x;
   int t        = threadIdx.x;
   int out_thread0;
+  [[maybe_unused]] null_count_back_copier _{s, t};
 
   if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; }
 
@@ -2065,19 +2089,6 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData(
     }
     __syncthreads();
   }
-
-  // if we are using the nesting decode cache, copy null count back
-  if (s->nesting_info == s->nesting_decode_cache) {
-    int depth = 0;
-    while (depth < s->page.num_output_nesting_levels) {
-      int const thread_depth = depth + t;
-      if (thread_depth < s->page.num_output_nesting_levels) {
-        s->page.nesting_decode[thread_depth].null_count =
-          s->nesting_decode_cache[thread_depth].null_count;
-      }
-      depth += blockDim.x;
-    }
-  }
 }
 
 }  // anonymous namespace
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 430d7c4a26d..ae8532b0993 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -241,13 +241,8 @@ std::unique_ptr<column> empty_like(column_buffer& buffer,
       auto child = empty_like(buffer.children[0], child_info, stream, mr);
 
       // make the final list column
-      return make_lists_column(0,
-                               std::move(offsets),
-                               std::move(child),
-                               buffer._null_count,
-                               std::move(buffer._null_mask),
-                               stream,
-                               mr);
+      return make_lists_column(
+        0, std::move(offsets), std::move(child), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
     } break;
 
     case type_id::STRUCT: {
@@ -265,12 +260,8 @@ std::unique_ptr<column> empty_like(column_buffer& buffer,
        return empty_like(col, child_info, stream, mr);
      });
 
-      return make_structs_column(0,
-                                 std::move(output_children),
-                                 buffer._null_count,
-                                 std::move(buffer._null_mask),
-                                 stream,
-                                 mr);
+      return make_structs_column(
+        0, std::move(output_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
     } break;
 
     default: return cudf::make_empty_column(buffer.type);
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cpp
index e13046eb90c..7cb5eeab9dc 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cpp
+++ b/cpp/tests/io/parquet_chunked_reader_test.cpp
@@ -920,3 +920,34 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithListsOfStructs)
     CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
   }
 }
+
+TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount)
+{
+  auto constexpr num_rows = 100'000;
+
+  auto const sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; });
+  auto const validity =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; });
+  cudf::test::fixed_width_column_wrapper<int> col{sequence, sequence + num_rows, validity};
+  std::vector<std::unique_ptr<cudf::column>> cols;
+  cols.push_back(col.release());
+  auto const expected = std::make_unique<cudf::table>(std::move(cols));
+
+  auto const filepath        = temp_env->get_temp_filepath("chunked_reader_null_count.parquet");
+  auto const page_limit_rows = num_rows / 5;
+  auto const write_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .max_page_size_rows(page_limit_rows)  // 20k rows per page
+      .build();
+  cudf::io::write_parquet(write_opts);
+
+  auto const byte_limit = page_limit_rows * sizeof(int);
+  auto const read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build();
+  auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts);
+
+  do {
+    // Every fourth row is null
+    EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), page_limit_rows / 4);
+  } while (reader.has_next());
+}
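
The page_data.cu change relies on a C++ scope-guard (RAII) idiom: gpuDecodePageData now constructs a null_count_back_copier before the early `return` that follows setupLocalPageInfo, so the destructor writes the cached null counts from nesting_decode_cache back to page.nesting_decode on every exit path, not just the fall-through at the end of the kernel; the new `s->nesting_info = nullptr` initialization lets the destructor skip the copy when setup bailed out before the cache was populated, and `[[maybe_unused]]` silences the warning for an object that exists only for its destructor. Below is a minimal host-side sketch of the same idiom, for illustration only; it is not part of the patch, uses hypothetical names, and omits the CUDA specifics.

// Hypothetical guard: copies a cached value back to its home location when the
// enclosing scope exits, regardless of which return path was taken.
#include <cstdio>

struct back_copier {
  int* home;
  int const* cache;
  ~back_copier() { *home = *cache; }
};

// `home` stands in for page.nesting_decode[...].null_count; the local `cache`
// stands in for the block's nesting_decode_cache.
void decode(int& home, bool bail_early)
{
  int cache = 0;
  back_copier guard{&home, &cache};

  cache = 7;
  if (bail_early) { return; }  // destructor still flushes 7 into `home`
  cache = 42;
}  // destructor flushes 42 into `home` on normal exit

int main()
{
  int a = -1, b = -1;
  decode(a, true);
  decode(b, false);
  std::printf("early exit: %d, normal exit: %d\n", a, b);  // prints 7 and 42
  return 0;
}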