From 90b763cc666c424f919ec8dcb1a0ccb064dde35e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 22 Feb 2024 00:41:19 -0800 Subject: [PATCH] Fix bugs in handling of delta encodings (#15075) Part of #14938 was fixing two bugs discovered during testing. One is in the encoding of DELTA_BINARY_PACKED data where the first non-null value in a page to be encoded is not in the first batch of 129 values. The second is an error in decoding of DELTA_BYTE_ARRAY pages where, again, the first non-null value is not in the first block to be decoded. This PR includes a test for the former, but the latter cannot be easily tested because the python API still lacks `skip_rows`, and we cannot generate DELTA_BYTE_ARRAY encoded data without the changes in #14938. A test for the latter will be added later, but the fix has been validated with data on hand locally. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15075 --- cpp/src/io/parquet/delta_enc.cuh | 4 ++-- cpp/src/io/parquet/page_string_decode.cu | 3 +++ cpp/tests/io/parquet_writer_test.cpp | 26 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index f90d364f5eb..49f4ccedbf0 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -201,7 +201,7 @@ class delta_binary_packer { if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } __syncthreads(); - if (threadIdx.x == 0) { + if (num_valid > 0 && threadIdx.x == 0) { _values_in_buffer += num_valid; // if first pass write header if (_current_idx == 0) { diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d652a43d097..5cd8205b4ba 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -535,6 +535,9 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d uint32_t const idx = db->current_value_idx + i + lane_id; if (idx >= start_value && idx < end_value && idx < db->value_count) { lane_sum += db->value[rolling_index(idx)]; + } + // need lane_max over all values, not just in bounds + if (idx < db->value_count) { lane_max = max(lane_max, db->value[rolling_index(idx)]); } } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 34061cb7bf8..62a24bf0a73 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1426,6 +1426,32 @@ TEST_F(ParquetWriterTest, RowGroupMetadata) static_cast(num_rows * sizeof(column_type))); } +TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls) +{ + // test that the DELTA_BINARY_PACKED writer can properly encode a column that begins with + // more than 129 nulls + constexpr int num_rows = 500; + constexpr int num_nulls = 150; + + auto const ones = thrust::make_constant_iterator(1); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [num_nulls](auto i) { return i >= num_nulls; }); + auto const col = cudf::test::fixed_width_column_wrapper{ones, ones + num_rows, valids}; + auto const expected = table_view({col}); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryStartsWithNulls.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + ///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template