Skip to content

Commit

Permalink
Make Parquet ColumnIndex null_counts optional (#14596)
Browse files Browse the repository at this point in the history
The Parquet specification lists the `null_counts` field of the `ColumnIndex` structure as "optional". This PR brings the cudf `ColumnIndex` struct in line with the specification.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14596
  • Loading branch information
etseidl authored Dec 13, 2023
1 parent 0698438 commit 420dc5d
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 10 deletions.
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ bool CompactProtocolReader::read(ColumnIndex* c)
parquet_field_binary_list(2, c->min_values),
parquet_field_binary_list(3, c->max_values),
parquet_field_enum<BoundaryOrder>(4, c->boundary_order),
parquet_field_int64_list(5, c->null_counts),
optional_list_i64(5, c->null_counts),
optional_list_i64(6, c->repetition_level_histogram),
optional_list_i64(7, c->definition_level_histogram));
return function_builder(this, op);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,8 @@ struct ColumnIndex {
std::vector<std::vector<uint8_t>> min_values; // lower bound for values in each page
std::vector<std::vector<uint8_t>> max_values; // upper bound for values in each page
BoundaryOrder boundary_order =
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
std::vector<int64_t> null_counts; // Optional count of null values per page
BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered
thrust::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page
// Repetition/definition level histograms for the column chunk
thrust::optional<std::vector<int64_t>> repetition_level_histogram;
thrust::optional<std::vector<int64_t>> definition_level_histogram;
Expand Down
20 changes: 13 additions & 7 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4249,6 +4249,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)

ASSERT_TRUE(stats.min_value.has_value());
ASSERT_TRUE(stats.max_value.has_value());
ASSERT_TRUE(ci.null_counts.has_value());

// schema indexing starts at 1
auto const ptype = fmd.schema[c + 1].type;
Expand All @@ -4257,7 +4258,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
// null_pages should always be false
EXPECT_FALSE(ci.null_pages[p]);
// null_counts should always be 0
EXPECT_EQ(ci.null_counts[p], 0);
EXPECT_EQ(ci.null_counts.value()[p], 0);
EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
}
for (size_t p = 0; p < ci.max_values.size(); p++)
Expand Down Expand Up @@ -4356,16 +4357,17 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
ASSERT_TRUE(stats.max_value.has_value());
ASSERT_TRUE(stats.null_count.has_value());
EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2);
ASSERT_TRUE(ci.null_counts.has_value());

// schema indexing starts at 1
auto const ptype = fmd.schema[c + 1].type;
auto const ctype = fmd.schema[c + 1].converted_type;
for (size_t p = 0; p < ci.min_values.size(); p++) {
EXPECT_FALSE(ci.null_pages[p]);
if (c > 0) { // first column has no nulls
EXPECT_GT(ci.null_counts[p], 0);
EXPECT_GT(ci.null_counts.value()[p], 0);
} else {
EXPECT_EQ(ci.null_counts[p], 0);
EXPECT_EQ(ci.null_counts.value()[p], 0);
}
EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
}
Expand Down Expand Up @@ -4453,6 +4455,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
}
ASSERT_TRUE(stats.null_count.has_value());
EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0);
ASSERT_TRUE(ci.null_counts.has_value());

// schema indexing starts at 1
auto const ptype = fmd.schema[c + 1].type;
Expand All @@ -4461,10 +4464,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
// check tnat null_pages is true for column 1
if (c == 1) {
EXPECT_TRUE(ci.null_pages[p]);
EXPECT_GT(ci.null_counts[p], 0);
EXPECT_GT(ci.null_counts.value()[p], 0);
}
if (not ci.null_pages[p]) {
EXPECT_EQ(ci.null_counts[p], 0);
EXPECT_EQ(ci.null_counts.value()[p], 0);
EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
}
}
Expand Down Expand Up @@ -4651,14 +4654,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
}

int64_t num_vals = 0;

if (is_v2) { ASSERT_TRUE(ci.null_counts.has_value()); }
for (size_t o = 0; o < oi.page_locations.size(); o++) {
auto const& page_loc = oi.page_locations[o];
auto const ph = read_page_header(source, page_loc);
EXPECT_EQ(ph.type, expected_hdr_type);
EXPECT_EQ(page_loc.first_row_index, num_vals);
num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values;
// check that null counts match
if (is_v2) { EXPECT_EQ(ci.null_counts[o], ph.data_page_header_v2.num_nulls); }
if (is_v2) { EXPECT_EQ(ci.null_counts.value()[o], ph.data_page_header_v2.num_nulls); }
}
}
}
Expand Down Expand Up @@ -4863,7 +4868,8 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)

// should only be one page
EXPECT_FALSE(ci.null_pages[0]);
EXPECT_EQ(ci.null_counts[0], expected_null_counts[c]);
ASSERT_TRUE(ci.null_counts.has_value());
EXPECT_EQ(ci.null_counts.value()[0], expected_null_counts[c]);

ASSERT_TRUE(ci.definition_level_histogram.has_value());
EXPECT_EQ(ci.definition_level_histogram.value(), expected_def_hists[c]);
Expand Down

0 comments on commit 420dc5d

Please sign in to comment.