diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9f705768e4eff..931b9fd10729f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline; using arrow::internal::AddWithOverflow; using arrow::internal::checked_cast; using arrow::internal::MultiplyWithOverflow; +using arrow::internal::SafeSignedSubtract; using arrow::internal::SubtractWithOverflow; using arrow::util::SafeLoad; using arrow::util::SafeLoadAs; @@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder deltas_; + T first_value_{0}; + T current_value_{0}; + ArrowPoolVector deltas_; std::shared_ptr bits_buffer_; ::arrow::BufferBuilder sink_; ::arrow::bit_util::BitWriter bit_writer_; @@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder::Put(const T* src, int num_values) { total_value_count_ += num_values; while (idx < num_values) { - UT value = static_cast(src[idx]); + T value = src[idx]; // Calculate deltas. The possible overflow is handled by use of unsigned integers // making subtraction operations well-defined and correct even in case of overflow. // Encoded integers will wrap back around on decoding. // See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n - deltas_[values_current_block_] = value - current_value_; + deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_); current_value_ = value; idx++; values_current_block_++; @@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder::FlushBlock() { return; } - const UT min_delta = + // Calculate the frame of reference for this miniblock. This value will be subtracted + // from all deltas to guarantee all deltas are positive for encoding. + const T min_delta = *std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_); - bit_writer_.PutZigZagVlqInt(static_cast(min_delta)); + bit_writer_.PutZigZagVlqInt(min_delta); // Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write // bit widths of miniblocks as they become known during the encoding. @@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder::FlushBlock() { std::min(values_per_mini_block_, values_current_block_); const uint32_t start = i * values_per_mini_block_; - const UT max_delta = *std::max_element( + const T max_delta = *std::max_element( deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block); // The minimum number of bits required to write any of values in deltas_ vector. // See overflow comment above. - const auto bit_width = bit_width_data[i] = - bit_util::NumRequiredBits(max_delta - min_delta); + const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits( + static_cast(max_delta) - static_cast(min_delta)); for (uint32_t j = start; j < start + values_current_mini_block; j++) { - // See overflow comment above. - const UT value = deltas_[j] - min_delta; + // Convert delta to frame of reference. See overflow comment above. + const UT value = static_cast(deltas_[j]) - static_cast(min_delta); bit_writer_.PutValue(value, bit_width); } // If there are not enough values to fill the last mini block, we pad the mini block diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 71dc40d33ac47..9861c317c80d9 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) { } } +// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values +// that will cause integer overflow (see GH-37939). +TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) { + using T = typename TypeParam::c_type; + + // Values that should wrap when converted to deltas, and then when converted to the + // frame of reference. + std::vector int_values = {std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::min(), + std::numeric_limits::max(), + 0, + -1, + 0, + 1, + -1, + 1}; + const int num_values = static_cast(int_values.size()); + + auto const encoder = MakeTypedEncoder( + Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get()); + encoder->Put(int_values, num_values); + auto const encoded = encoder->FlushValues(); + + auto const decoder = + MakeTypedDecoder(Encoding::DELTA_BINARY_PACKED, this->descr_.get()); + + std::vector decoded(num_values); + decoder->SetData(num_values, encoded->data(), static_cast(encoded->size())); + + const int values_decoded = decoder->Decode(decoded.data(), num_values); + + ASSERT_EQ(num_values, values_decoded); + ASSERT_NO_FATAL_FAILURE( + VerifyResults(decoded.data(), int_values.data(), num_values)); +} + +// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than +// necessary (see GH-37939). +TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) { + using T = typename TypeParam::c_type; + constexpr int num_values = 128; + // 128 values should be <= 1 block of values encoded with 2 bits + // delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes) + // mini-block header should be 0x01 0x02020202 (5 bytes) + constexpr int encoded_size = 2 * num_values / 8 + 6 + 5; + + // Create a run of {1, 0, -1, 0, 1, 0, ...}. + // min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits + // to encode. + std::vector int_values(num_values); + std::iota(int_values.begin(), int_values.end(), 0); + std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) { + return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1; + }); + + auto const encoder = MakeTypedEncoder( + Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get()); + encoder->Put(int_values, num_values); + auto const encoded = encoder->FlushValues(); + + ASSERT_EQ(encoded->size(), encoded_size); +} + // ---------------------------------------------------------------------- // Rle for Boolean encode/decode tests.