Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37939: [C++] Use signed arithmetic for frame of reference when encoding DELTA_BINARY_PACKED #37940

Merged
merged 1 commit into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
using arrow::internal::AddWithOverflow;
using arrow::internal::checked_cast;
using arrow::internal::MultiplyWithOverflow;
using arrow::internal::SafeSignedSubtract;
using arrow::internal::SubtractWithOverflow;
using arrow::util::SafeLoad;
using arrow::util::SafeLoadAs;
Expand Down Expand Up @@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual public TypedEncoder<DTyp
const uint32_t values_per_mini_block_;
uint32_t values_current_block_{0};
uint32_t total_value_count_{0};
UT first_value_{0};
UT current_value_{0};
ArrowPoolVector<UT> deltas_;
T first_value_{0};
T current_value_{0};
ArrowPoolVector<T> deltas_;
std::shared_ptr<ResizableBuffer> bits_buffer_;
::arrow::BufferBuilder sink_;
::arrow::bit_util::BitWriter bit_writer_;
Expand All @@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int num_values) {
total_value_count_ += num_values;

while (idx < num_values) {
UT value = static_cast<UT>(src[idx]);
T value = src[idx];
// Calculate deltas. The possible overflow is handled by use of unsigned integers
// making subtraction operations well-defined and correct even in case of overflow.
// Encoded integers will wrap back around on decoding.
// See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
deltas_[values_current_block_] = value - current_value_;
deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
current_value_ = value;
idx++;
values_current_block_++;
Expand All @@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
return;
}

const UT min_delta =
// Calculate the frame of reference for this miniblock. This value will be subtracted
// from all deltas to guarantee all deltas are positive for encoding.
const T min_delta =
*std::min_element(deltas_.begin(), deltas_.begin() + values_current_block_);
bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
bit_writer_.PutZigZagVlqInt(min_delta);

// Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to write
// bit widths of miniblocks as they become known during the encoding.
Expand All @@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
std::min(values_per_mini_block_, values_current_block_);

const uint32_t start = i * values_per_mini_block_;
const UT max_delta = *std::max_element(
const T max_delta = *std::max_element(
deltas_.begin() + start, deltas_.begin() + start + values_current_mini_block);

// The minimum number of bits required to write any of values in deltas_ vector.
// See overflow comment above.
const auto bit_width = bit_width_data[i] =
bit_util::NumRequiredBits(max_delta - min_delta);
const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
static_cast<UT>(max_delta) - static_cast<UT>(min_delta));

for (uint32_t j = start; j < start + values_current_mini_block; j++) {
// See overflow comment above.
const UT value = deltas_[j] - min_delta;
// Convert delta to frame of reference. See overflow comment above.
const UT value = static_cast<UT>(deltas_[j]) - static_cast<UT>(min_delta);
bit_writer_.PutValue(value, bit_width);
}
// If there are not enough values to fill the last mini block, we pad the mini block
Expand Down
64 changes: 64 additions & 0 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) {
}
}

// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values
// that will cause integer overflow (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
using T = typename TypeParam::c_type;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment refering to the GH issue?


// Values that should wrap when converted to deltas, and then when converted to the
// frame of reference.
std::vector<T> int_values = {std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(),
0,
-1,
0,
1,
-1,
1};
const int num_values = static_cast<int>(int_values.size());

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

auto const decoder =
MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, this->descr_.get());

std::vector<T> decoded(num_values);
decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));

const int values_decoded = decoder->Decode(decoded.data(), num_values);

ASSERT_EQ(num_values, values_decoded);
ASSERT_NO_FATAL_FAILURE(
VerifyResults<T>(decoded.data(), int_values.data(), num_values));
}
Copy link
Member

@pitrou pitrou Oct 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this PR also fixes the encoded data size, can you add a test for that? For example check that the encoded buffer size is equal to a certain value given data that would have triggered the bug, such as the data in #37939 (comment)


// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode than
// necessary (see GH-37939).
TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
using T = typename TypeParam::c_type;
constexpr int num_values = 128;
// 128 values should be <= 1 block of values encoded with 2 bits
// delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
// mini-block header should be 0x01 0x02020202 (5 bytes)
constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;

// Create a run of {1, 0, -1, 0, 1, 0, ...}.
// min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this requires 2 bits
// to encode.
std::vector<T> int_values(num_values);
std::iota(int_values.begin(), int_values.end(), 0);
std::transform(int_values.begin(), int_values.end(), int_values.begin(), [](T idx) {
return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
});

auto const encoder = MakeTypedEncoder<TypeParam>(
Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, this->descr_.get());
encoder->Put(int_values, num_values);
auto const encoded = encoder->FlushValues();

ASSERT_EQ(encoded->size(), encoded_size);
}

// ----------------------------------------------------------------------
// Rle for Boolean encode/decode tests.

Expand Down
Loading