Skip to content

Commit

Permalink
Fix Parquet decimal64 stats (#15281)
Browse files Browse the repository at this point in the history
In the Parquet writer, `decimal64` stats were being treated like `decimal128` (i.e. written in network byte order), when they should be treated like an `int64_t`. This PR fixes that and adds tests of `decimal32` and `decimal64` statistics.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - MithunR (https://github.com/mythrocks)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #15281
  • Loading branch information
etseidl authored Mar 18, 2024
1 parent 13cb4bc commit e435953
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2896,9 +2896,9 @@ __device__ std::pair<void const*, uint32_t> get_extremum(statistics_val const* s
return {scratch, sizeof(float)};
}
case dtype_int64:
case dtype_decimal64:
case dtype_timestamp64:
case dtype_float64: return {stats_val, sizeof(int64_t)};
case dtype_decimal64:
case dtype_decimal128:
byte_reverse128(stats_val->d128_val, scratch);
return {scratch, sizeof(__int128_t)};
Expand Down
58 changes: 58 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,64 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall)
EXPECT_EQ(ph.data_page_header.num_values, num_rows);
}

TEST_F(ParquetWriterTest, Decimal32Stats)
{
// check that decimal64 min and max statistics are written properly
std::vector<uint8_t> expected_min{0, 0, 0xb2, 0xa1};
std::vector<uint8_t> expected_max{0xb2, 0xa1, 0, 0};

int32_t val0 = 0xa1b2;
int32_t val1 = val0 << 16;
column_wrapper<numeric::decimal32> col0{{numeric::decimal32(val0, numeric::scale_type{0}),
numeric::decimal32(val1, numeric::scale_type{0})}};

auto expected = table_view{{col0}};

auto const filepath = temp_env->get_temp_filepath("Decimal32Stats.parquet");
const cudf::io::parquet_writer_options out_opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
cudf::io::write_parquet(out_opts);

auto const source = cudf::io::datasource::create(filepath);
cudf::io::parquet::detail::FileMetaData fmd;

read_footer(source, &fmd);

auto const stats = get_statistics(fmd.row_groups[0].columns[0]);

EXPECT_EQ(expected_min, stats.min_value);
EXPECT_EQ(expected_max, stats.max_value);
}

TEST_F(ParquetWriterTest, Decimal64Stats)
{
// check that decimal64 min and max statistics are written properly
std::vector<uint8_t> expected_min{0, 0, 0, 0, 0xd4, 0xc3, 0xb2, 0xa1};
std::vector<uint8_t> expected_max{0xd4, 0xc3, 0xb2, 0xa1, 0, 0, 0, 0};

int64_t val0 = 0xa1b2'c3d4UL;
int64_t val1 = val0 << 32;
column_wrapper<numeric::decimal64> col0{{numeric::decimal64(val0, numeric::scale_type{0}),
numeric::decimal64(val1, numeric::scale_type{0})}};

auto expected = table_view{{col0}};

auto const filepath = temp_env->get_temp_filepath("Decimal64Stats.parquet");
const cudf::io::parquet_writer_options out_opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
cudf::io::write_parquet(out_opts);

auto const source = cudf::io::datasource::create(filepath);
cudf::io::parquet::detail::FileMetaData fmd;

read_footer(source, &fmd);

auto const stats = get_statistics(fmd.row_groups[0].columns[0]);

EXPECT_EQ(expected_min, stats.min_value);
EXPECT_EQ(expected_max, stats.max_value);
}

TEST_F(ParquetWriterTest, Decimal128Stats)
{
// check that decimal128 min and max statistics are written in network byte order
Expand Down

0 comments on commit e435953

Please sign in to comment.