Skip to content

Commit

Permalink
change delta binary packed to use signed types
Browse files Browse the repository at this point in the history
  • Loading branch information
etseidl committed Oct 3, 2023
1 parent f6afc33 commit 19b304b
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 12 deletions.
27 changes: 15 additions & 12 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
using arrow::internal::AddWithOverflow;
using arrow::internal::checked_cast;
using arrow::internal::MultiplyWithOverflow;
using arrow::internal::SafeSignedSubtract;
using arrow::internal::SubtractWithOverflow;
using arrow::util::SafeLoad;
using arrow::util::SafeLoadAs;
Expand Down Expand Up @@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder<DTyp
const uint32_t values_per_mini_block_;
uint32_t values_current_block_{0};
uint32_t total_value_count_{0};
UT first_value_{0};
UT current_value_{0};
ArrowPoolVector<UT> deltas_;
T first_value_{0};
T current_value_{0};
ArrowPoolVector<T> deltas_;
std::shared_ptr<ResizableBuffer> bits_buffer_;
::arrow::BufferBuilder sink_;
::arrow::bit_util::BitWriter bit_writer_;
Expand All @@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int num_values) {
total_value_count_ += num_values;

while (idx < num_values) {
UT value = static_cast<UT>(src[idx]);
T value = src[idx];
// Calculate deltas. The possible overflow is handled by use of unsigned integers
// making subtraction operations well-defined and correct even in case of overflow.
// Encoded integers will wrap back around on decoding.
// See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
deltas_[values_current_block_] = value - current_value_;
deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
current_value_ = value;
idx++;
values_current_block_++;
Expand All @@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
return;
}

const UT min_delta =
// Calculate the frame of reference for this miniblock. This value will be subtracted
// from all deltas to guarantee all deltas are positive for encoding.
const T min_delta =
*std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_);
bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
bit_writer_.PutZigZagVlqInt(min_delta);

// Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write
// bit widths of miniblocks as they become known during the encoding.
Expand All @@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
std::min(values_per_mini_block_, values_current_block_);

const uint32_t start = i * values_per_mini_block_;
const UT max_delta = *std::max_element(
const T max_delta = *std::max_element(
deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block);

// The minimum number of bits required to write any of values in deltas_ vector.
// See overflow comment above.
const auto bit_width = bit_width_data[i] =
bit_util::NumRequiredBits(max_delta - min_delta);
const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
static_cast<UT>(max_delta) - static_cast<UT>(min_delta));

for (uint32_t j = start; j < start + values_current_mini_block; j++) {
// See overflow comment above.
const UT value = deltas_[j] - min_delta;
// Convert delta to frame of reference. See overflow comment above.
const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta);
bit_writer_.PutValue(value, bit_width);
}
// If there are not enough values to fill the last mini block, we pad the mini block
Expand Down
64 changes: 64 additions & 0 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) {
}
}

// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values
// that will cause integer overflow (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
using T = typename TypeParam::c_type;

// Values that should wrap when converted to deltas, and then when converted to the
// frame of reference.
std::vector<T> int_values = {std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
0,
-1,
0,
1,
-1,
1};
const int num_values = static_cast<int>(int_values.size());

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

auto const decoder =
MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get());

std::vector<T> decoded(num_values);
decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));

const int values_decoded = decoder->Decode(decoded.data(), num_values);

ASSERT_EQ(num_values, values_decoded);
ASSERT_NO_FATAL_FAILURE(
VerifyResults<T>(decoded.data(), int_values.data(), num_values));
}

// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than
// necessary (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
using T = typename TypeParam::c_type;
constexpr int num_values = 128;
// 128 values should be <= 1 block of values encoded with 2 bits
// delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
// mini-block header should be 0x01 0x02020202 (5 bytes)
constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;

// Create a run of {1, 0, -1, 0, 1, 0, ...}.
// min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits
// to encode.
std::vector<T> int_values(num_values);
std::iota(int_values.begin(), int_values.end(), 0);
std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) {
return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
});

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

ASSERT_EQ(encoded->size(), encoded_size);
}

// ----------------------------------------------------------------------
// Rle for Boolean encode/decode tests.

Expand Down

0 comments on commit 19b304b

Please sign in to comment.