change delta binary packed to use signed types

apache · Oct 3, 2023 · 19b304b · 19b304b
1 parent f6afc33
commit 19b304b
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 12 deletions.
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
@@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
 using arrow::internal::AddWithOverflow;
 using arrow::internal::checked_cast;
 using arrow::internal::MultiplyWithOverflow;
+using arrow::internal::SafeSignedSubtract;
 using arrow::internal::SubtractWithOverflow;
 using arrow::util::SafeLoad;
 using arrow::util::SafeLoadAs;
@@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder<DTyp
   const uint32_t values_per_mini_block_;
   uint32_t values_current_block_{0};
   uint32_t total_value_count_{0};
-  UT first_value_{0};
-  UT current_value_{0};
-  ArrowPoolVector<UT> deltas_;
+  T first_value_{0};
+  T current_value_{0};
+  ArrowPoolVector<T> deltas_;
   std::shared_ptr<ResizableBuffer> bits_buffer_;
   ::arrow::BufferBuilder sink_;
   ::arrow::bit_util::BitWriter bit_writer_;
@@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int num_values) {
   total_value_count_ += num_values;
 
   while (idx < num_values) {
-    UT value = static_cast<UT>(src[idx]);
+    T value = src[idx];
     // Calculate deltas. The possible overflow is handled by use of unsigned integers
     // making subtraction operations well-defined and correct even in case of overflow.
     // Encoded integers will wrap back around on decoding.
     // See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
-    deltas_[values_current_block_] = value - current_value_;
+    deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
     current_value_ = value;
     idx++;
     values_current_block_++;
@@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
     return;
   }
 
-  const UT min_delta =
+  // Calculate the frame of reference for this miniblock. This value will be subtracted
+  // from all deltas to guarantee all deltas are positive for encoding.
+  const T min_delta =
       *std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_);
-  bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
+  bit_writer_.PutZigZagVlqInt(min_delta);
 
   // Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write
   // bit widths of miniblocks as they become known during the encoding.
@@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
         std::min(values_per_mini_block_, values_current_block_);
 
     const uint32_t start = i * values_per_mini_block_;
-    const UT max_delta = *std::max_element(
+    const T max_delta = *std::max_element(
         deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block);
 
     // The minimum number of bits required to write any of values in deltas_ vector.
     // See overflow comment above.
-    const auto bit_width = bit_width_data[i] =
-        bit_util::NumRequiredBits(max_delta - min_delta);
+    const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
+        static_cast<UT>(max_delta) - static_cast<UT>(min_delta));
 
     for (uint32_t j = start; j < start + values_current_mini_block; j++) {
-      // See overflow comment above.
-      const UT value = deltas_[j] - min_delta;
+      // Convert delta to frame of reference. See overflow comment above.
+      const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta);
       bit_writer_.PutValue(value, bit_width);
     }
     // If there are not enough values to fill the last mini block, we pad the mini block

diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
@@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) {
   }
 }
 
+// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values
+// that will cause integer overflow (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
+  using T = typename TypeParam::c_type;
+
+  // Values that should wrap when converted to deltas, and then when converted to the
+  // frame of reference.
+  std::vector<T> int_values = {std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               0,
+                               -1,
+                               0,
+                               1,
+                               -1,
+                               1};
+  const int num_values = static_cast<int>(int_values.size());
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  auto const decoder =
+      MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get());
+
+  std::vector<T> decoded(num_values);
+  decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));
+
+  const int values_decoded = decoder->Decode(decoded.data(), num_values);
+
+  ASSERT_EQ(num_values, values_decoded);
+  ASSERT_NO_FATAL_FAILURE(
+      VerifyResults<T>(decoded.data(), int_values.data(), num_values));
+}
+
+// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than
+// necessary (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
+  using T = typename TypeParam::c_type;
+  constexpr int num_values = 128;
+  // 128 values should be <= 1 block of values encoded with 2 bits
+  // delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
+  // mini-block header should be 0x01 0x02020202 (5 bytes)
+  constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;
+
+  // Create a run of {1, 0, -1, 0, 1, 0, ...}.
+  // min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits
+  // to encode.
+  std::vector<T> int_values(num_values);
+  std::iota(int_values.begin(), int_values.end(), 0);
+  std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) {
+    return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
+  });
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  ASSERT_EQ(encoded->size(), encoded_size);
+}
+
 // ----------------------------------------------------------------------
 // Rle for Boolean encode/decode tests.