apacheGH-37939: [C++] Use signed arithmetic for frame of reference wh…

…en encoding DELTA_BINARY_PACKED (apache#37940) Closes apache#37939. ### What changes are included in this PR? This PR changes values used in the `DELTA_BINARY_PACKED` encoder to signed types. To gracefully handle overflow, arithmetic is still performed in the unsigned domain, but other operations such as computing the min and max deltas are done in the signed domain. Using signed types ensures the optimal number of bits is used when encoding the deltas, which was not the case before if any negative deltas were encountered (which is obviously common). ### Are these changes tested? I've included two tests that result in overflow. ### Are there any user-facing changes? No * Closes: apache#37939 Authored-by: seidl <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
JerAguilon · Oct 23, 2023 · 58d427f · 58d427f
1 parent 61c0277
commit 58d427f
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 12 deletions.
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
@@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
 using arrow::internal::AddWithOverflow;
 using arrow::internal::checked_cast;
 using arrow::internal::MultiplyWithOverflow;
+using arrow::internal::SafeSignedSubtract;
 using arrow::internal::SubtractWithOverflow;
 using arrow::util::SafeLoad;
 using arrow::util::SafeLoadAs;
@@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder<DTyp
   const uint32_t values_per_mini_block_;
   uint32_t values_current_block_{0};
   uint32_t total_value_count_{0};
-  UT first_value_{0};
-  UT current_value_{0};
-  ArrowPoolVector<UT> deltas_;
+  T first_value_{0};
+  T current_value_{0};
+  ArrowPoolVector<T> deltas_;
   std::shared_ptr<ResizableBuffer> bits_buffer_;
   ::arrow::BufferBuilder sink_;
   ::arrow::bit_util::BitWriter bit_writer_;
@@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int num_values) {
   total_value_count_ += num_values;
 
   while (idx < num_values) {
-    UT value = static_cast<UT>(src[idx]);
+    T value = src[idx];
     // Calculate deltas. The possible overflow is handled by use of unsigned integers
     // making subtraction operations well-defined and correct even in case of overflow.
     // Encoded integers will wrap back around on decoding.
     // See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
-    deltas_[values_current_block_] = value - current_value_;
+    deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
     current_value_ = value;
     idx++;
     values_current_block_++;
@@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
     return;
   }
 
-  const UT min_delta =
+  // Calculate the frame of reference for this miniblock. This value will be subtracted
+  // from all deltas to guarantee all deltas are positive for encoding.
+  const T min_delta =
       *std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_);
-  bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
+  bit_writer_.PutZigZagVlqInt(min_delta);
 
   // Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write
   // bit widths of miniblocks as they become known during the encoding.
@@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
         std::min(values_per_mini_block_, values_current_block_);
 
     const uint32_t start = i * values_per_mini_block_;
-    const UT max_delta = *std::max_element(
+    const T max_delta = *std::max_element(
         deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block);
 
     // The minimum number of bits required to write any of values in deltas_ vector.
     // See overflow comment above.
-    const auto bit_width = bit_width_data[i] =
-        bit_util::NumRequiredBits(max_delta - min_delta);
+    const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
+        static_cast<UT>(max_delta) - static_cast<UT>(min_delta));
 
     for (uint32_t j = start; j < start + values_current_mini_block; j++) {
-      // See overflow comment above.
-      const UT value = deltas_[j] - min_delta;
+      // Convert delta to frame of reference. See overflow comment above.
+      const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta);
       bit_writer_.PutValue(value, bit_width);
     }
     // If there are not enough values to fill the last mini block, we pad the mini block

diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
@@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) {
   }
 }
 
+// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values
+// that will cause integer overflow (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
+  using T = typename TypeParam::c_type;
+
+  // Values that should wrap when converted to deltas, and then when converted to the
+  // frame of reference.
+  std::vector<T> int_values = {std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               0,
+                               -1,
+                               0,
+                               1,
+                               -1,
+                               1};
+  const int num_values = static_cast<int>(int_values.size());
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  auto const decoder =
+      MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get());
+
+  std::vector<T> decoded(num_values);
+  decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));
+
+  const int values_decoded = decoder->Decode(decoded.data(), num_values);
+
+  ASSERT_EQ(num_values, values_decoded);
+  ASSERT_NO_FATAL_FAILURE(
+      VerifyResults<T>(decoded.data(), int_values.data(), num_values));
+}
+
+// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than
+// necessary (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
+  using T = typename TypeParam::c_type;
+  constexpr int num_values = 128;
+  // 128 values should be <= 1 block of values encoded with 2 bits
+  // delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
+  // mini-block header should be 0x01 0x02020202 (5 bytes)
+  constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;
+
+  // Create a run of {1, 0, -1, 0, 1, 0, ...}.
+  // min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits
+  // to encode.
+  std::vector<T> int_values(num_values);
+  std::iota(int_values.begin(), int_values.end(), 0);
+  std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) {
+    return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
+  });
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  ASSERT_EQ(encoded->size(), encoded_size);
+}
+
 // ----------------------------------------------------------------------
 // Rle for Boolean encode/decode tests.