Fix calculation of null counts for Parquet statistics (#12938)
The current Parquet writer sometimes generates incorrect values for `null_count` in the column chunk statistics and page indexes. This happens with nested schemas when nulls are present at a level above the leaf values. This PR fixes the calculation by adding a `non_leaf_nulls` field to the `statistics_group` struct; that count is added to the chunk `null_count` computed over leaf values in `gpu_calculate_group_statistics()`.
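
To make the discrepancy concrete: in a list column such as [[NULL, 2], NULL], the leaf values contribute one null, but Parquet's `null_count` must also include the NULL list itself, for a total of two. Below is a minimal sketch of the corrected accounting, using hypothetical stand-in names rather than cuDF's actual types:

#include <cstdint>

// Illustrative stand-in for the per-fragment counts the writer tracks;
// the real fields live on cuDF's fragment and statistics_group types.
struct fragment_counts {
  uint32_t num_values;       // value slots at every nesting level
  uint32_t num_leaf_values;  // value slots at the leaf level only
  uint32_t leaf_null_count;  // nulls among the leaf values
};

// Parquet expects the total null count, so nulls occurring above the
// leaf level (e.g. a NULL list) are added to the leaf-level count,
// mirroring the num_values - num_rows arithmetic in page_enc.cu below.
uint32_t parquet_null_count(fragment_counts const& f)
{
  uint32_t const non_leaf_nulls = f.num_values - f.num_leaf_values;
  return f.leaf_null_count + non_leaf_nulls;
}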

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #12938
etseidl authored Mar 17, 2023
1 parent 1e377fc commit 3540613
Showing 4 changed files with 144 additions and 3 deletions.
1 change: 1 addition & 0 deletions cpp/src/io/parquet/page_enc.cu
@@ -282,6 +282,7 @@ __global__ void __launch_bounds__(128)
g.col = ck_g->col_desc;
g.start_row = fragments[frag_id].start_value_idx;
g.num_rows = fragments[frag_id].num_leaf_values;
g.non_leaf_nulls = fragments[frag_id].num_values - g.num_rows;
groups[frag_id] = g;
}
}
10 changes: 8 additions & 2 deletions cpp/src/io/statistics/column_statistics.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -129,7 +129,13 @@ struct calculate_group_statistics_functor {

chunk = block_reduce(chunk, storage);

-  if (t == 0) { s.ck = get_untyped_chunk(chunk); }
+  if (t == 0) {
+    // parquet wants total null count in stats, not just count of null leaf values
+    if constexpr (IO == detail::io_file_format::PARQUET) {
+      chunk.null_count += s.group.non_leaf_nulls;
+    }
+    s.ck = get_untyped_chunk(chunk);
+  }
}

template <typename T,
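
The `if constexpr` guard above compiles the extra addition only into the Parquet instantiation of the functor, so other formats presumably keep their existing leaf-only counts. A reduced sketch of that compile-time dispatch, with hypothetical names standing in for the real statistics types:

#include <cstdint>

// Hypothetical mirror of the format tag the functor is templated on.
enum class io_file_format { ORC, PARQUET };

struct group_info {
  uint32_t non_leaf_nulls;  // nulls above the leaf level
};

template <io_file_format IO>
uint32_t finalize_null_count(uint32_t leaf_nulls, group_info const& g)
{
  // In non-PARQUET instantiations this branch is a discarded statement,
  // so the accounting for other formats is untouched.
  if constexpr (IO == io_file_format::PARQUET) { return leaf_nulls + g.non_leaf_nulls; }
  return leaf_nulls;
}

// e.g. finalize_null_count<io_file_format::PARQUET>(2, {1}) returns 3,
//      while finalize_null_count<io_file_format::ORC>(2, {1}) returns 2.
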
3 changes: 2 additions & 1 deletion cpp/src/io/statistics/statistics.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -111,6 +111,7 @@ struct statistics_group {
const stats_column_desc* col; //!< Column information
uint32_t start_row; //!< Start row of this group
uint32_t num_rows; //!< Number of rows in group
uint32_t non_leaf_nulls; //!< Number of null non-leaf values in the group
};

struct statistics_merge_group {
133 changes: 133 additions & 0 deletions cpp/tests/io/parquet_test.cpp
@@ -4279,6 +4279,9 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNulls)
auto const ci = read_column_index(source, chunk);
auto const stats = parse_statistics(chunk);

// should be half nulls, except no nulls in column 0
EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2);

// schema indexing starts at 1
auto const ptype = fmd.schema[c + 1].type;
auto const ctype = fmd.schema[c + 1].converted_type;
@@ -4364,6 +4367,9 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNullColumn)
auto const ci = read_column_index(source, chunk);
auto const stats = parse_statistics(chunk);

// there should be no nulls except column 1 which is all nulls
EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0);

// schema indexing starts at 1
auto const ptype = fmd.schema[c + 1].type;
auto const ctype = fmd.schema[c + 1].converted_type;
@@ -4465,6 +4471,133 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexStruct)
}
}

TEST_F(ParquetWriterTest, CheckColumnIndexListWithNulls)
{
using cudf::test::iterators::null_at;
using cudf::test::iterators::nulls_at;
using lcw = cudf::test::lists_column_wrapper<int32_t>;

// 4 nulls
// [NULL, 2, NULL]
// []
// [4, 5]
// NULL
lcw col0{{{{1, 2, 3}, nulls_at({0, 2})}, {}, {4, 5}, {}}, null_at(3)};

// 4 nulls
// [[1, 2, 3], [], [4, 5], [], [0, 6, 0]]
// [[7, 8]]
// []
// [[]]
lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}};

// 4 nulls
// [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]]
// [[7, 8]]
// []
// [[]]
lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, lcw{}, lcw{lcw{}}};

// 6 nulls
// [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]]
// [[7, 8]]
// []
// [[]]
using dlcw = cudf::test::lists_column_wrapper<double>;
dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, nulls_at({0, 2})}}, null_at(3)},
{{7., 8.}},
dlcw{},
dlcw{dlcw{}}};

// 4 nulls
// [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]]
// [[7, 8]]
// []
// NULL
using ui16lcw = cudf::test::lists_column_wrapper<uint16_t>;
cudf::test::lists_column_wrapper<uint16_t> col4{
{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}},
null_at(3)};

// 6 nulls
// [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]]
// [[7, 8]]
// []
// NULL
lcw col5{{{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, nulls_at({0, 2})}}, null_at(3)},
{{7, 8}},
lcw{},
lcw{lcw{}}},
null_at(3)};

// 4 nulls
using strlcw = cudf::test::lists_column_wrapper<cudf::string_view>;
cudf::test::lists_column_wrapper<cudf::string_view> col6{
{{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}},
{{"bee", "sting"}},
strlcw{},
strlcw{strlcw{}}};

// 11 nulls
// [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]]
// [NULL, [[13],[14,15,16]], NULL]
// [NULL, [], NULL, [[]]]
// NULL
lcw col7{{
{{{{1, 2, 3, 4}, nulls_at({0, 2})}}, {{{5, 6, 7}, nulls_at({0, 2})}, {8, 9}}},
{{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, nulls_at({0, 2})},
{{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, nulls_at({0, 2})},
lcw{lcw{lcw{}}},
},
null_at(3)};

table_view expected({col0, col1, col2, col3, col4, col5, col6, col7});

int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 11};

auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet");
auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
.stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN)
.compression(cudf::io::compression_type::NONE);

cudf::io::write_parquet(out_opts);

auto const source = cudf::io::datasource::create(filepath);
cudf::io::parquet::FileMetaData fmd;

read_footer(source, &fmd);

for (size_t r = 0; r < fmd.row_groups.size(); r++) {
auto const& rg = fmd.row_groups[r];
for (size_t c = 0; c < rg.columns.size(); c++) {
auto const& chunk = rg.columns[c];

// loop over offsets, read each page header, make sure it's a data page and that
// the first row index is correct
auto const oi = read_offset_index(source, chunk);

int64_t num_vals = 0;
for (size_t o = 0; o < oi.page_locations.size(); o++) {
auto const& page_loc = oi.page_locations[o];
auto const ph = read_page_header(source, page_loc);
EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE);
// last column has 2 values per row
EXPECT_EQ(page_loc.first_row_index * (c == rg.columns.size() - 1 ? 2 : 1), num_vals);
num_vals += ph.data_page_header.num_values;
}

// check null counts in column chunk stats and page indexes
auto const ci = read_column_index(source, chunk);
auto const stats = parse_statistics(chunk);
EXPECT_EQ(stats.null_count, expected_null_counts[c]);

// should only be one page
EXPECT_FALSE(ci.null_pages[0]);
EXPECT_EQ(ci.null_counts[0], expected_null_counts[c]);
}
}
}
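
The expected counts follow from Parquet's definition levels: every leaf-level value slot whose definition level is below the maximum contributes to `null_count`, which covers null elements, NULL lists at any nesting level, and empty lists alike. A small sketch of that arithmetic (a hypothetical helper, not something the test uses):

#include <cstdint>

// Hypothetical tally of the ways a list column yields definition levels
// below the maximum; each one contributes to Parquet's null_count.
struct list_null_tally {
  int64_t null_elements;  // NULL leaf values
  int64_t null_lists;     // NULL rows or sublists at any nesting level
  int64_t empty_lists;    // empty lists also terminate above the leaf
};

int64_t expected_null_count(list_null_tally const& t)
{
  return t.null_elements + t.null_lists + t.empty_lists;
}

// col0: 2 null elements + 1 NULL list + 1 empty list  -> 4
// col1: no NULLs at all, but 4 empty lists            -> 4
// col7: 4 null elements + 5 NULL sublists + 2 empties -> 11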

TEST_F(ParquetWriterTest, CheckColumnIndexTruncation)
{
const char* coldata[] = {
