From 3540613436ee95296be61f66fed72de44314fb33 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 16 Mar 2023 23:53:46 -0700 Subject: [PATCH] Fix calculation of null counts for Parquet statistics (#12938) The current Parquet writer sometimes generates wrong values for `null_count` in the column chunk statistics and page indexes. This occurs for nested schemas when nulls occur at a level above the leaf values. This PR fixes the calculation by adding a `non_leaf_nulls` field to the `statistics_group` struct. This field is added to the chunk `null_count` calculated over leaf values in `gpu_calculate_group_statistics()`. Authors: - Ed Seidl (https://github.com/etseidl) - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/12938 --- cpp/src/io/parquet/page_enc.cu | 1 + cpp/src/io/statistics/column_statistics.cuh | 10 +- cpp/src/io/statistics/statistics.cuh | 3 +- cpp/tests/io/parquet_test.cpp | 133 ++++++++++++++++++++ 4 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5a12acec2a3..e48696fcb9b 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -282,6 +282,7 @@ __global__ void __launch_bounds__(128) g.col = ck_g->col_desc; g.start_row = fragments[frag_id].start_value_idx; g.num_rows = fragments[frag_id].num_leaf_values; + g.non_leaf_nulls = fragments[frag_id].num_values - g.num_rows; groups[frag_id] = g; } } diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 125235ebf2f..0b09cb63d19 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,7 +129,13 @@ struct calculate_group_statistics_functor { chunk = block_reduce(chunk, storage); - if (t == 0) { s.ck = get_untyped_chunk(chunk); } + if (t == 0) { + // parquet wants total null count in stats, not just count of null leaf values + if constexpr (IO == detail::io_file_format::PARQUET) { + chunk.null_count += s.group.non_leaf_nulls; + } + s.ck = get_untyped_chunk(chunk); + } } template ; + + // 4 nulls + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + lcw col0{{{{1, 2, 3}, nulls_at({0, 2})}, {}, {4, 5}, {}}, null_at(3)}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // [[]] + using dlcw = cudf::test::lists_column_wrapper; + dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, nulls_at({0, 2})}}, null_at(3)}, + {{7., 8.}}, + dlcw{}, + dlcw{dlcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // NULL + using ui16lcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col4{ + {{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, + null_at(3)}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // NULL + lcw col5{{{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, nulls_at({0, 2})}}, null_at(3)}, + {{7, 8}}, + lcw{}, + lcw{lcw{}}}, + null_at(3)}; + + // 4 nulls + using strlcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col6{ + {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, + {{"bee", "sting"}}, + strlcw{}, + strlcw{strlcw{}}}; + + // 11 nulls + // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // [NULL, [[13],[14,15,16]], NULL] + // [NULL, [], NULL, [[]]] + // NULL + lcw col7{{ + {{{{1, 2, 3, 4}, nulls_at({0, 2})}}, {{{5, 6, 7}, nulls_at({0, 2})}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, nulls_at({0, 2})}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, nulls_at({0, 2})}, + lcw{lcw{lcw{}}}, + }, + null_at(3)}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); + + int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 11}; + + auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); + auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::NONE); + + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + // last column has 2 values per row + EXPECT_EQ(page_loc.first_row_index * (c == rg.columns.size() - 1 ? 2 : 1), num_vals); + num_vals += ph.data_page_header.num_values; + } + + // check null counts in column chunk stats and page indexes + auto const ci = read_column_index(source, chunk); + auto const stats = parse_statistics(chunk); + EXPECT_EQ(stats.null_count, expected_null_counts[c]); + + // should only be one page + EXPECT_FALSE(ci.null_pages[0]); + EXPECT_EQ(ci.null_counts[0], expected_null_counts[c]); + } + } +} + TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { const char* coldata[] = {