Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Embeds OwnedBytes into the FastFieldCodecReader. #1458

Merged
merged 1 commit into from
Aug 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fastfield_codecs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ description = "Fast field codecs used by tantivy"
[dependencies]
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true}

Expand Down
12 changes: 8 additions & 4 deletions fastfield_codecs/src/bitpacked.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::io::{self, Write};

use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
Expand All @@ -9,29 +10,32 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
}

impl FastFieldCodecReader for BitpackedFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedFastFieldReader {
data,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, data)
fn get_u64(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
fn min_value(&self) -> u64 {
Expand Down
17 changes: 9 additions & 8 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ extern crate more_asserts;
use std::io;
use std::io::Write;

use ownedbytes::OwnedBytes;

pub mod bitpacked;
pub mod linearinterpol;
pub mod multilinearinterpol;

pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;

fn get_u64(&self, doc: u64, data: &[u8]) -> u64;

fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
Expand Down Expand Up @@ -98,7 +98,7 @@ mod tests {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let mut out = vec![];
let mut out: Vec<u8> = Vec::new();
S::serialize(
&mut out,
&data,
Expand All @@ -108,17 +108,18 @@ mod tests {
)
.unwrap();

let reader = R::open_from_bytes(&out).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);

let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64, &out);
let val = reader.get_u64(doc as u64);
if val != *orig_val {
panic!(
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
val, orig_val, name, data
);
}
}
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
Expand Down
13 changes: 8 additions & 5 deletions fastfield_codecs/src/linearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::io::{self, Read, Write};
use std::ops::Sub;

use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
Expand All @@ -10,6 +11,7 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
/// fast field is required.
#[derive(Clone)]
pub struct LinearInterpolFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub footer: LinearInterpolFooter,
pub slope: f32,
Expand Down Expand Up @@ -57,23 +59,24 @@ impl FixedSize for LinearInterpolFooter {

impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);

let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearInterpolFastFieldReader {
data,
bit_unpacker,
footer,
slope,
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}

#[inline]
Expand Down
15 changes: 8 additions & 7 deletions fastfield_codecs/src/multilinearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use std::io::{self, Read, Write};
use std::ops::Sub;

use common::{BinarySerializable, CountingWriter, DeserializeFrom};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
Expand All @@ -24,6 +25,7 @@ const CHUNK_SIZE: u64 = 512;
/// fast field is required.
#[derive(Clone)]
pub struct MultiLinearInterpolFastFieldReader {
data: OwnedBytes,
pub footer: MultiLinearInterpolFooter,
}

Expand Down Expand Up @@ -145,24 +147,23 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio

impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;

let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;

Ok(MultiLinearInterpolFastFieldReader { footer })
Ok(MultiLinearInterpolFastFieldReader { data, footer })
}

#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
let doc = doc - interpolation.start_pos;
let calculated_value =
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
let diff = interpolation
.bit_unpacker
.get(doc, &data[interpolation.data_start_offset as usize..]);
.get(doc, &self.data[interpolation.data_start_offset as usize..]);
(calculated_value + diff) - interpolation.positive_val_offset
}

Expand Down
13 changes: 7 additions & 6 deletions src/fastfield/gcd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader;
use gcd::Gcd;
use ownedbytes::OwnedBytes;

pub const GCD_DEFAULT: u64 = 1;
pub const GCD_CODEC_ID: u8 = 4;
Expand All @@ -19,12 +20,12 @@ pub struct GCDFastFieldCodec<CodecReader> {
}
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self> {
let (header, mut footer) = bytes.split_at(bytes.len() - 16);
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(header)?;

let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
Expand All @@ -33,8 +34,8 @@ impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec
}

#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let mut data = self.reader.get_u64(doc, data);
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
data *= self.gcd;
data += self.min_value;
data
Expand Down
7 changes: 3 additions & 4 deletions src/fastfield/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
reader: CodecReader,
bytes: OwnedBytes,
_phantom: PhantomData<Item>,
}

Expand All @@ -235,16 +234,16 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes.as_slice())?;
let reader = C::open_from_bytes(bytes)?;
Ok(FastFieldReaderCodecWrapper {
reader,
bytes,
_phantom: PhantomData,
})
}

#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc, self.bytes.as_slice());
let data = self.reader.get_u64(doc);
Item::from_u64(data)
}

Expand Down