Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: BitPackedCompressor allows signed arrays #1699

Merged
merged 7 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Vortex array:
>>> parquet = pq.read_table("_static/example.parquet")
>>> vtx = vortex.array(parquet)
>>> vtx.nbytes
141070
141024

Compress
^^^^^^^^
Expand All @@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

>>> cvtx = vortex.compress(vtx)
>>> cvtx.nbytes
16756
15243
a10y marked this conversation as resolved.
Show resolved Hide resolved
>>> cvtx.nbytes / vtx.nbytes
0.118...
0.10...

Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
cache and RAM.
Expand Down
24 changes: 18 additions & 6 deletions encodings/fastlanes/src/bitpacking/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ pub unsafe fn bitpack_encode_unchecked(
///
/// On success, returns a [Buffer] containing the packed data.
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
// We know the min is > 0, so it's safe to re-interpret signed integers as unsigned.
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
Expand Down Expand Up @@ -359,7 +358,7 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
#[cfg(test)]
#[allow(clippy::cast_possible_truncation)]
mod test {
use vortex_array::{IntoArrayVariant, ToArrayData};
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};

use super::*;

Expand Down Expand Up @@ -431,12 +430,25 @@ mod test {
}

#[test]
#[should_panic(expected = "expected type: uint but instead got i64")]
fn gh_issue_929() {
fn compress_signed_roundtrip() {
let values: Vec<i64> = (-500..500).collect();
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
assert!(array.ptype().is_signed_int());

BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let bitpacked_array =
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let num_patches = bitpacked_array
.patches()
.as_ref()
.map(Patches::num_patches)
.unwrap_or_default();
assert_eq!(num_patches, 500);

let unpacked = bitpacked_array
.into_canonical()
.unwrap()
.into_primitive()
.unwrap();
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
}
}
33 changes: 31 additions & 2 deletions encodings/fastlanes/src/bitpacking/compute/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,21 @@ use crate::{BitPackedArray, BitPackedEncoding};

impl FilterFn<BitPackedArray> for BitPackedEncoding {
fn filter(&self, array: &BitPackedArray, mask: FilterMask) -> VortexResult<ArrayData> {
let primitive = match_each_unsigned_integer_ptype!(array.ptype(), |$I| {
let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| {
filter_primitive::<$I>(array, mask)
});
Ok(primitive?.into_array())
}
}

/// Specialized filter kernel for primitive bit-packed arrays.
///
/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided
/// `T` should be promoted to the unsigned variant for any target bit width.
/// For example, if the array is bit-packed `i16`, this function should be called with `T = u16`.
///
/// All bit-packing operations will use the unsigned kernels, but the logical type of `array`
/// dictates the final `PType` of the result.
fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
array: &BitPackedArray,
mask: FilterMask,
Expand Down Expand Up @@ -49,7 +57,7 @@ fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
FilterIter::SlicesIter(iter) => filter_slices(array, mask.true_count(), iter),
};

let mut values = PrimitiveArray::from_vec(values, validity);
let mut values = PrimitiveArray::from_vec(values, validity).reinterpret_cast(array.ptype());
if let Some(patches) = patches {
values = values.patch(patches)?;
}
Expand Down Expand Up @@ -120,6 +128,7 @@ fn filter_slices<T: NativePType + BitPacking + ArrowNativeType>(

#[cfg(test)]
mod test {
use itertools::Itertools;
use vortex_array::array::PrimitiveArray;
use vortex_array::compute::{filter, slice, FilterMask};
use vortex_array::{ArrayLen, IntoArrayVariant};
Expand Down Expand Up @@ -166,4 +175,24 @@ mod test {
(0..1024).map(|i| (i % 63) as u8).collect::<Vec<_>>()
);
}

#[test]
fn filter_bitpacked_signed() {
// Elements 0..=499 are negative integers (patches)
// Element 500 = 0 (packed)
// Elements 501..=999 are positive integers (packed)
let values: Vec<i64> = (-500..500).collect_vec();
let unpacked = PrimitiveArray::from(values.clone());
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
let filtered = filter(
bitpacked.as_ref(),
FilterMask::from_indices(values.len(), 250..750),
)
.unwrap()
.into_primitive()
.unwrap()
.into_maybe_null_slice::<i64>();

assert_eq!(filtered.as_slice(), &values[250..750]);
}
}
4 changes: 0 additions & 4 deletions encodings/fastlanes/src/bitpacking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,6 @@ impl BitPackedArray {
) -> VortexResult<Self> {
let dtype = DType::Primitive(ptype, validity.nullability());

if !dtype.is_unsigned_int() {
lwwmanning marked this conversation as resolved.
Show resolved Hide resolved
vortex_bail!(MismatchedTypes: "uint", &dtype);
}

if bit_width > u64::BITS as u8 {
vortex_bail!("Unsupported bit width {}", bit_width);
}
Expand Down
2 changes: 1 addition & 1 deletion pyvortex/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::array::PyArray;
///
/// >>> a = vortex.array(list(range(1000)))
/// >>> str(vortex.compress(a))
/// 'fastlanes.for(0x17)(i64, len=1000)'
/// 'fastlanes.bitpacked(0x15)(i64, len=1000)'
///
/// Compress an array of increasing floating-point numbers and a few nulls:
///
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@ impl EncodingCompressor for BitPackedCompressor {
// Only support primitive arrays
let parray = PrimitiveArray::maybe_from(array)?;

// Only supports unsigned ints
if !parray.ptype().is_unsigned_int() {
if !parray.ptype().is_int() {
return None;
}

Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/for.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ impl EncodingCompressor for FoRCompressor {
let shift = trailing_zeros(array);
match_each_integer_ptype!(parray.ptype(), |$P| {
let min: $P = parray.statistics().compute_min()?;
if min == 0 && shift == 0 && parray.ptype().is_unsigned_int() {
if min == 0 && shift == 0 {
return None;
}
});
Expand Down
4 changes: 2 additions & 2 deletions vortex-sampling-compressor/tests/smoketest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ mod tests {
use vortex_datetime_dtype::TimeUnit;
use vortex_datetime_parts::DateTimePartsEncoding;
use vortex_dict::DictEncoding;
use vortex_fastlanes::FoREncoding;
use vortex_fastlanes::BitPackedEncoding;
use vortex_fsst::FSSTEncoding;
use vortex_sampling_compressor::ALL_COMPRESSORS;
use vortex_scalar::Scalar;
Expand Down Expand Up @@ -122,7 +122,7 @@ mod tests {
.unwrap();
println!("prim_col num chunks: {}", prim_col.nchunks());
for chunk in prim_col.chunks() {
assert_eq!(chunk.encoding().id(), FoREncoding::ID);
assert_eq!(chunk.encoding().id(), BitPackedEncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from((chunk.len() * 8) as u64 + 1))
Expand Down
Loading