Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new sstable format #1943

Merged
merged 9 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions columnar/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
assert_eq!(cols[0].num_bytes(), 88);
}

#[test]
Expand All @@ -31,7 +31,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
assert_eq!(cols[0].num_bytes(), 88);
}

#[test]
Expand Down
63 changes: 63 additions & 0 deletions common/src/dictionary_footer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::io::{self, Read, Write};

use crate::BinarySerializable;

/// Identifies which dictionary implementation wrote a given term dictionary.
///
/// The discriminant values are part of the on-disk format (serialized as a
/// `u32` in the footer) and must never be changed or reused.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum DictionaryKind {
    /// FST-based dictionary (tantivy-fst).
    Fst = 1,
    /// SSTable-based dictionary.
    SSTable = 2,
}

/// Trailing footer of a serialized dictionary: the dictionary kind and the
/// version of its on-disk format.
///
/// Readers compare the footer found on disk against the footer they support
/// (see [`DictionaryFooter::verify_equal`]) before attempting to decode.
#[derive(Debug, Clone, PartialEq)]
pub struct DictionaryFooter {
    // Which dictionary implementation produced the file.
    pub kind: DictionaryKind,
    // Format version of that implementation.
    pub version: u32,
}

impl DictionaryFooter {
    /// Checks that `other` (typically read from disk) matches `self`
    /// (the kind/version this reader supports).
    ///
    /// # Errors
    /// Returns an `io::Error` (kind `Other`) describing the mismatch when
    /// either the dictionary kind or the version differ.
    pub fn verify_equal(&self, other: &DictionaryFooter) -> io::Result<()> {
        if self.kind != other.kind {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "Invalid dictionary type, expected {:?}, found {:?}",
                    self.kind, other.kind
                ),
            ));
        }
        if self.version != other.version {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    // fixed typo: was "Unsuported"
                    "Unsupported dictionary version, expected {}, found {}",
                    self.version, other.version
                ),
            ));
        }
        Ok(())
    }
}

impl BinarySerializable for DictionaryFooter {
    /// Writes the footer: version first, then the kind discriminant, both as
    /// little-endian `u32` (matching the on-disk footer layout).
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        self.version.serialize(writer)?;
        (self.kind as u32).serialize(writer)?;
        Ok(())
    }

    /// Reads a footer in the same order `serialize` wrote it, rejecting
    /// unknown kind discriminants.
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let version = u32::deserialize(reader)?;
        match u32::deserialize(reader)? {
            1 => Ok(DictionaryFooter {
                kind: DictionaryKind::Fst,
                version,
            }),
            2 => Ok(DictionaryFooter {
                kind: DictionaryKind::SSTable,
                version,
            }),
            unknown => Err(io::Error::new(
                io::ErrorKind::Other,
                format!("invalid dictionary kind: {unknown}"),
            )),
        }
    }
}
2 changes: 2 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ pub use byteorder::LittleEndian as Endianness;

mod bitset;
mod datetime;
mod dictionary_footer;
pub mod file_slice;
mod group_by;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use datetime::{DatePrecision, DateTime};
pub use dictionary_footer::*;
pub use group_by::GroupByIteratorExtended;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
Expand Down
16 changes: 8 additions & 8 deletions src/fastfield/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ mod tests {
}
let file = directory.open_read(path).unwrap();

assert_eq!(file.len(), 161);
assert_eq!(file.len(), 94);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let column = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -180,7 +180,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 189);
assert_eq!(file.len(), 122);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -213,7 +213,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 162);
assert_eq!(file.len(), 95);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let fast_field_reader = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -245,7 +245,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 4557);
assert_eq!(file.len(), 4490);
{
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
Expand Down Expand Up @@ -278,7 +278,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 333_usize);
assert_eq!(file.len(), 266);

{
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
Expand Down Expand Up @@ -772,7 +772,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 175);
assert_eq!(file.len(), 103);
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.first(0), Some(true));
Expand Down Expand Up @@ -804,7 +804,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 187);
assert_eq!(file.len(), 115);
let readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 {
Expand All @@ -829,7 +829,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 177);
assert_eq!(file.len(), 105);
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None);
Expand Down
2 changes: 0 additions & 2 deletions sstable/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ license = "MIT"

[dependencies]
common = {path="../common", package="tantivy-common"}
ciborium = "0.2"
serde = "1"
tantivy-fst = "0.4"

[dev-dependencies]
Expand Down
91 changes: 91 additions & 0 deletions sstable/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,94 @@ possible.
- it allows incremental encoding of the keys
- the front compression is leveraged to optimize
the intersection with an automaton

# On disk format

Overview of the SSTable format. Unless noted otherwise, numbers are little-endian.

### SSTable
```
+-------+-------+-----+--------+
| Block | Block | ... | Footer |
+-------+-------+-----+--------+
|----( # of blocks)---|
```
- Block(`SSTBlock`): list of independent blocks, terminated by a single empty block.
- Footer(`SSTFooter`)

### SSTBlock
```
+----------+--------+-------+-------+-----+
| BlockLen | Values | Delta | Delta | ... |
+----------+--------+-------+-------+-----+
|----( # of deltas)---|
```
- BlockLen(u32): length of the block
- Values: an application defined format storing a sequence of values, capable of determining its own length
- Delta

### Delta
```
+---------+--------+
| KeepAdd | Suffix |
+---------+--------+
```
- KeepAdd
- Suffix: KeepAdd.add bytes of key suffix

### KeepAdd
KeepAdd can be represented in two different representations: a very compact 1-byte one, which is enough for most usage, and a longer variable-length one when required.

When keep < 16 and add < 16
```
+-----+------+
| Add | Keep |
+-----+------+
```
- Add(u4): number of bytes to push
- Keep(u4): number of bytes to pop

Otherwise:
```
+------+------+-----+
| 0x01 | Keep | Add |
+------+------+-----+
```
- Add(VInt): number of bytes to push
- Keep(VInt): number of bytes to pop


Note: as the SSTable does not support redundant keys, there is no ambiguity between both representations. Add is always guaranteed to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great comment. This also implies that we do not support redundant keys. Is it written somewhere in this document?

Suggested change
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is, this could be rephrased as

Suggested change
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Note: as the SSTable does not support redundant keys, there is no ambiguity between both representation. Add is always guaranteed to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.

to include that information

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oups, I forgot to add that before merging. I'll add it in the compression PR, or as an independent PR if it compression end up not making it


### SSTFooter
```
+-------+-------+-----+-------------+---------+---------+------+
| Block | Block | ... | IndexOffset | NumTerm | Version | Type |
+-------+-------+-----+-------------+---------+---------+------+
|----( # of blocks)---|
```
- Block(SSTBlock): uses IndexValue for its Values format
- IndexOffset(u64): Offset to the start of the SSTFooter
- NumTerm(u64): number of terms in the sstable
- Version(u32): Currently defined to 0x00\_00\_00\_01
- Type(u32): Defined to 0x00\_00\_00\_02

### IndexValue
```
+------------+-------+-------+-----+
| EntryCount | Entry | Entry | ... |
+------------+-------+-------+-----+
|---( # of entries)---|
```

- EntryCount(VInt): number of entries
- Entry (IndexEntry)

### Entry
```
+----------+--------------+
| BlockLen | FirstOrdinal |
+----------+--------------+
```
- BlockLen(VInt): length of the block
- FirstOrdinal(VInt): ordinal of the first element in the given block
14 changes: 7 additions & 7 deletions sstable/src/delta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ where W: io::Write
value_writer: TValueWriter,
// Only here to avoid allocations.
stateless_buffer: Vec<u8>,
block_len: usize,
}

impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
Expand All @@ -31,15 +32,14 @@ where
write: CountingWriter::wrap(BufWriter::new(wrt)),
value_writer: TValueWriter::default(),
stateless_buffer: Vec::new(),
block_len: BLOCK_LEN,
}
}
}

impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub fn set_block_len(&mut self, block_len: usize) {
self.block_len = block_len
}

pub fn flush_block(&mut self) -> io::Result<Option<Range<usize>>> {
if self.block.is_empty() {
return Ok(None);
Expand Down Expand Up @@ -82,7 +82,7 @@ where
}

pub fn flush_block_if_required(&mut self) -> io::Result<Option<Range<usize>>> {
if self.block.len() > BLOCK_LEN {
if self.block.len() > self.block_len {
return self.flush_block();
}
Ok(None)
Expand Down
13 changes: 9 additions & 4 deletions sstable/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::ops::{Bound, RangeBounds};
use std::sync::Arc;

use common::file_slice::FileSlice;
use common::{BinarySerializable, OwnedBytes};
use common::{BinarySerializable, DictionaryFooter, OwnedBytes};
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;

Expand Down Expand Up @@ -110,7 +110,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// only block for up to `limit` matching terms.
///
/// It works by identifying
/// - `first_block`: the block containing the start boudary key
/// - `first_block`: the block containing the start boundary key
/// - `last_block`: the block containing the end boundary key.
///
/// And then returning the range that spans over all blocks between.
Expand Down Expand Up @@ -178,10 +178,15 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {

/// Opens a `TermDictionary`.
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(24);
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;

let index_offset = u64::deserialize(&mut footer_len_bytes)?;
let num_terms = u64::deserialize(&mut footer_len_bytes)?;

let footer = DictionaryFooter::deserialize(&mut footer_len_bytes)?;
crate::FOOTER.verify_equal(&footer)?;

let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
let sstable_index_bytes = index_slice.read_bytes()?;
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice())
Expand Down Expand Up @@ -231,7 +236,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
let suffix = sstable_delta_reader.suffix();

match prefix_len.cmp(&ok_bytes) {
Ordering::Less => return Ok(None), // poped bytes already matched => too far
Ordering::Less => return Ok(None), // popped bytes already matched => too far
Ordering::Equal => (),
Ordering::Greater => {
// the ok prefix is less than current entry prefix => continue to next elem
Expand Down
Loading