forked from quickwit-oss/tantivy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrated fastfield codecs into columnar. (quickwit-oss#1782)
Introduced asymmetric OptionalCodec / SerializableOptionalCodec. Removed cardinality from the columnar sstable. Added DynamicColumn. Reorganized all files. Changed DenseCodec serialization logic. Renamed methods to rank/select. Moved versioning footer to the columnar level.
- Loading branch information
1 parent
a6ab5b6
commit f373d86
Showing
44 changed files
with
6,035 additions
and
327 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# zero to one | ||
* merges | ||
* full still needs a num_values | ||
* replug u128 | ||
* add dictionary encoded stuff | ||
* fix multivalued | ||
* find a way to make columnar work with strict types | ||
* plug to tantivy | ||
- indexing | ||
- aggregations | ||
- merge | ||
|
||
# Perf and Size | ||
* re-add ZSTD compression for dictionaries | ||
no systematic monotonic mapping | ||
consider removing multilinear | ||
f32? | ||
adhoc solution for bool? | ||
|
||
add metrics helper for aggregate. sum(row_id) | ||
review inline absence/presence | ||
improve perf of select using PDEP | ||
compare with roaring bitmap/elias fano etc etc. | ||
SIMD range? (see blog post) | ||
Add alignment? | ||
Consider another codec to bridge the gap between few and 5k elements | ||
|
||
# Cleanup and rationalization | ||
in benchmark, unify percent vs ratio, f32 vs f64. | ||
investigate if should have better errors? io::Error is overused at the moment. | ||
rename rank/select in unit tests | ||
Review the public API via cargo doc | ||
go through TODOs | ||
remove all doc_id occurrences -> row_id | ||
use the rank & select naming in unit tests branch. | ||
multi-linear -> blockwise | ||
linear codec -> simply a multiplication for the index column | ||
|
||
# Other | ||
fix enhance column-cli | ||
|
||
# Santa claus | ||
|
||
autodetect datetime ipaddr, plug customizable tokenizer. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
use std::io; | ||
use std::ops::Deref; | ||
use std::sync::Arc; | ||
|
||
use sstable::{Dictionary, VoidSSTable}; | ||
|
||
use crate::column::Column; | ||
use crate::column_index::ColumnIndex; | ||
|
||
/// Dictionary encoded column. | ||
/// | ||
/// Values are stored as term ordinals in `term_ord_column`; `dictionary` is | ||
/// the structure queried to resolve an ordinal back into a term. | ||
#[derive(Clone)] | ||
pub struct BytesColumn { | ||
/// Term dictionary queried by `term_ord_to_str` (through `ord_to_term`). | ||
pub(crate) dictionary: Arc<Dictionary<VoidSSTable>>, | ||
/// Column of term ordinals, exposed through `term_ords`. | ||
pub(crate) term_ord_column: Column<u64>, | ||
} | ||
|
||
impl BytesColumn { | ||
/// Looks up `term_ord` in the dictionary, handing `output` to `ord_to_term`. | ||
/// | ||
/// NOTE(review): despite the `_str` suffix, `output` is a raw byte buffer; it is | ||
/// presumably filled with the term bytes on success -- confirm against `Dictionary`. | ||
/// | ||
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the | ||
/// overall number of terms). | ||
pub fn term_ord_to_str(&self, term_ord: u64, output: &mut Vec<u8>) -> io::Result<bool> { | ||
self.dictionary.ord_to_term(term_ord, output) | ||
} | ||
|
||
/// Returns the underlying column of term ordinals. | ||
pub fn term_ords(&self) -> &Column<u64> { | ||
&self.term_ord_column | ||
} | ||
} | ||
|
||
impl Deref for BytesColumn { | ||
type Target = ColumnIndex<'static>; | ||
|
||
fn deref(&self) -> &Self::Target { | ||
&**self.term_ords() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
// TODO: no test is defined yet; the import below stays unused until one is added. | ||
use crate::{ColumnarReader, ColumnarWriter}; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
mod dictionary_encoded; | ||
mod serialize; | ||
|
||
use std::ops::Deref; | ||
use std::sync::Arc; | ||
|
||
use common::BinarySerializable; | ||
pub use dictionary_encoded::BytesColumn; | ||
pub use serialize::{open_column_bytes, open_column_u64, serialize_column_u64}; | ||
|
||
use crate::column_index::ColumnIndex; | ||
use crate::column_values::ColumnValues; | ||
use crate::{Cardinality, RowId}; | ||
|
||
/// A column: a `ColumnIndex` paired with the stored values. | ||
#[derive(Clone)] | ||
pub struct Column<T> { | ||
/// Index used to turn a row id into a position in `values` (see `first`). | ||
pub idx: ColumnIndex<'static>, | ||
/// Stored values, read through `ColumnValues::get_val`. | ||
pub values: Arc<dyn ColumnValues<T>>, | ||
} | ||
|
||
use crate::column_index::Set; | ||
|
||
impl<T: PartialOrd> Column<T> { | ||
pub fn first(&self, row_id: RowId) -> Option<T> { | ||
match &self.idx { | ||
ColumnIndex::Full => Some(self.values.get_val(row_id)), | ||
ColumnIndex::Optional(opt_idx) => { | ||
let value_row_idx = opt_idx.rank_if_exists(row_id)?; | ||
Some(self.values.get_val(value_row_idx)) | ||
} | ||
ColumnIndex::Multivalued(_multivalued_index) => { | ||
todo!(); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/// Lets a `Column` be used wherever a `&ColumnIndex` is expected. | ||
impl<T> Deref for Column<T> { | ||
type Target = ColumnIndex<'static>; | ||
|
||
/// Borrows the column's `idx` field. | ||
fn deref(&self) -> &Self::Target { | ||
&self.idx | ||
} | ||
} | ||
|
||
impl BinarySerializable for Cardinality { | ||
fn serialize<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> { | ||
self.to_code().serialize(writer) | ||
} | ||
|
||
fn deserialize<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> { | ||
let cardinality_code = u8::deserialize(reader)?; | ||
let cardinality = Cardinality::try_from_code(cardinality_code)?; | ||
Ok(cardinality) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
use std::io; | ||
use std::io::Write; | ||
use std::sync::Arc; | ||
|
||
use common::{CountingWriter, OwnedBytes}; | ||
use sstable::Dictionary; | ||
|
||
use crate::column::{BytesColumn, Column}; | ||
use crate::column_index::{serialize_column_index, SerializableColumnIndex}; | ||
use crate::column_values::{ | ||
serialize_column_values, ColumnValues, MonotonicallyMappableToU64, ALL_CODEC_TYPES, | ||
}; | ||
pub fn serialize_column_u64<T: MonotonicallyMappableToU64>( | ||
column_index: SerializableColumnIndex<'_>, | ||
column_values: &impl ColumnValues<T>, | ||
output: &mut impl Write, | ||
) -> io::Result<()> { | ||
let mut counting_writer = CountingWriter::wrap(output); | ||
serialize_column_index(column_index, &mut counting_writer)?; | ||
let column_index_num_bytes = counting_writer.written_bytes() as u32; | ||
let output = counting_writer.finish(); | ||
serialize_column_values(column_values, &ALL_CODEC_TYPES[..], output)?; | ||
output.write_all(&column_index_num_bytes.to_le_bytes())?; | ||
Ok(()) | ||
} | ||
|
||
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> { | ||
let (body, column_index_num_bytes_payload) = bytes.rsplit(4); | ||
let column_index_num_bytes = u32::from_le_bytes( | ||
column_index_num_bytes_payload | ||
.as_slice() | ||
.try_into() | ||
.unwrap(), | ||
); | ||
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize); | ||
let column_index = crate::column_index::open_column_index(column_index_data)?; | ||
let column_values = crate::column_values::open_u64_mapped(column_values_data)?; | ||
Ok(Column { | ||
idx: column_index, | ||
values: column_values, | ||
}) | ||
} | ||
|
||
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> { | ||
let (body, dictionary_len_bytes) = data.rsplit(4); | ||
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap()); | ||
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize); | ||
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?); | ||
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?; | ||
Ok(BytesColumn { | ||
dictionary, | ||
term_ord_column, | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
mod multivalued_index; | ||
mod optional_index; | ||
mod serialize; | ||
|
||
use std::sync::Arc; | ||
|
||
pub use optional_index::{OptionalIndex, SerializableOptionalIndex, Set}; | ||
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex}; | ||
|
||
use crate::column_values::ColumnValues; | ||
use crate::{Cardinality, RowId}; | ||
|
||
/// Index of a column; each variant corresponds to one `Cardinality` | ||
/// (see `get_cardinality`). | ||
#[derive(Clone)] | ||
pub enum ColumnIndex<'a> { | ||
/// Reported as `Cardinality::Full`; carries no data. | ||
Full, | ||
/// Reported as `Cardinality::Optional`; backed by an `OptionalIndex`. | ||
Optional(OptionalIndex), | ||
// TODO remove the Arc<dyn> apart from serialization this is not | ||
// dynamic at all. | ||
/// Reported as `Cardinality::Multivalued`. | ||
/// NOTE(review): `num_rows` is computed as `num_vals() - 1`, which suggests one entry | ||
/// per row plus a trailing sentinel -- confirm. | ||
Multivalued(Arc<dyn ColumnValues<RowId> + 'a>), | ||
} | ||
|
||
impl<'a> ColumnIndex<'a> { | ||
pub fn get_cardinality(&self) -> Cardinality { | ||
match self { | ||
ColumnIndex::Full => Cardinality::Full, | ||
ColumnIndex::Optional(_) => Cardinality::Optional, | ||
ColumnIndex::Multivalued(_) => Cardinality::Multivalued, | ||
} | ||
} | ||
|
||
pub fn num_rows(&self) -> RowId { | ||
match self { | ||
ColumnIndex::Full => { | ||
todo!() | ||
} | ||
ColumnIndex::Optional(optional_index) => optional_index.num_rows(), | ||
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.num_vals() - 1, | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
use std::io; | ||
use std::io::Write; | ||
use std::sync::Arc; | ||
|
||
use common::OwnedBytes; | ||
|
||
use crate::column_values::{ColumnValues, FastFieldCodecType}; | ||
use crate::RowId; | ||
|
||
/// Newtype around a shared `ColumnValues<RowId>`; consumed by | ||
/// `serialize_multivalued_index`. | ||
#[derive(Clone)] | ||
pub struct MultivaluedIndex(Arc<dyn ColumnValues<RowId>>); | ||
|
||
pub fn serialize_multivalued_index( | ||
multivalued_index: MultivaluedIndex, | ||
output: &mut impl Write, | ||
) -> io::Result<()> { | ||
crate::column_values::serialize_column_values( | ||
&*multivalued_index.0, | ||
&[FastFieldCodecType::Bitpacked, FastFieldCodecType::Linear], | ||
output, | ||
)?; | ||
Ok(()) | ||
} | ||
|
||
/// Not implemented yet: always panics via `todo!()`; `bytes` is currently unused. | ||
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<RowId>>> { | ||
todo!(); | ||
} |
Oops, something went wrong.