Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new sstable format #1943

Merged
merged 9 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions columnar/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
assert_eq!(cols[0].num_bytes(), 88);
}

#[test]
Expand All @@ -31,7 +31,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 158);
assert_eq!(cols[0].num_bytes(), 88);
}

#[test]
Expand Down
63 changes: 63 additions & 0 deletions common/src/dictionary_footer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::io::{self, Read, Write};

use crate::BinarySerializable;

/// Identifies which dictionary implementation wrote a given term dictionary.
///
/// The discriminant values are part of the on-disk format (serialized as a
/// `u32` in the footer) and must never be changed or reused.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum DictionaryKind {
    /// FST-based dictionary (tantivy-fst).
    Fst = 1,
    /// SSTable-based dictionary.
    SSTable = 2,
}

/// Trailing footer of a serialized dictionary: the dictionary kind and the
/// version of its on-disk format.
///
/// Readers compare the footer found on disk against the footer they support
/// (see [`DictionaryFooter::verify_equal`]) before attempting to decode.
#[derive(Debug, Clone, PartialEq)]
pub struct DictionaryFooter {
    // Which dictionary implementation produced the file.
    pub kind: DictionaryKind,
    // Format version of that implementation.
    pub version: u32,
}

impl DictionaryFooter {
    /// Checks that `other` (typically read from disk) matches `self`
    /// (the kind/version this reader supports).
    ///
    /// # Errors
    /// Returns an `io::Error` (kind `Other`) describing the mismatch when
    /// either the dictionary kind or the version differ.
    pub fn verify_equal(&self, other: &DictionaryFooter) -> io::Result<()> {
        if self.kind != other.kind {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "Invalid dictionary type, expected {:?}, found {:?}",
                    self.kind, other.kind
                ),
            ));
        }
        if self.version != other.version {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    // fixed typo: was "Unsuported"
                    "Unsupported dictionary version, expected {}, found {}",
                    self.version, other.version
                ),
            ));
        }
        Ok(())
    }
}

impl BinarySerializable for DictionaryFooter {
    /// Writes the footer: version first, then the kind discriminant, both as
    /// little-endian `u32` (matching the on-disk footer layout).
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        self.version.serialize(writer)?;
        (self.kind as u32).serialize(writer)?;
        Ok(())
    }

    /// Reads a footer in the same order `serialize` wrote it, rejecting
    /// unknown kind discriminants.
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let version = u32::deserialize(reader)?;
        match u32::deserialize(reader)? {
            1 => Ok(DictionaryFooter {
                kind: DictionaryKind::Fst,
                version,
            }),
            2 => Ok(DictionaryFooter {
                kind: DictionaryKind::SSTable,
                version,
            }),
            unknown => Err(io::Error::new(
                io::ErrorKind::Other,
                format!("invalid dictionary kind: {unknown}"),
            )),
        }
    }
}
2 changes: 2 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ pub use byteorder::LittleEndian as Endianness;

mod bitset;
mod datetime;
mod dictionary_footer;
pub mod file_slice;
mod group_by;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use datetime::{DatePrecision, DateTime};
pub use dictionary_footer::*;
pub use group_by::GroupByIteratorExtended;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
Expand Down
16 changes: 8 additions & 8 deletions src/fastfield/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ mod tests {
}
let file = directory.open_read(path).unwrap();

assert_eq!(file.len(), 161);
assert_eq!(file.len(), 94);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let column = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -180,7 +180,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 189);
assert_eq!(file.len(), 122);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -213,7 +213,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 162);
assert_eq!(file.len(), 95);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let fast_field_reader = fast_field_readers
.u64("field")
Expand Down Expand Up @@ -245,7 +245,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 4557);
assert_eq!(file.len(), 4490);
{
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
Expand Down Expand Up @@ -278,7 +278,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 333_usize);
assert_eq!(file.len(), 266);

{
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
Expand Down Expand Up @@ -772,7 +772,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 175);
assert_eq!(file.len(), 103);
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.first(0), Some(true));
Expand Down Expand Up @@ -804,7 +804,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 187);
assert_eq!(file.len(), 115);
let readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 {
Expand All @@ -829,7 +829,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 177);
assert_eq!(file.len(), 105);
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None);
Expand Down
2 changes: 0 additions & 2 deletions sstable/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ license = "MIT"

[dependencies]
common = {path="../common", package="tantivy-common"}
ciborium = "0.2"
serde = "1"
tantivy-fst = "0.4"

[dev-dependencies]
Expand Down
91 changes: 91 additions & 0 deletions sstable/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,94 @@ possible.
- it allows incremental encoding of the keys
- the front compression is leveraged to optimize
the intersection with an automaton

# On disk format

Overview of the SSTable format. Unless noted otherwise, numbers are little-endian.

### SSTable
```
+-------+-------+-----+--------+
| Block | Block | ... | Footer |
+-------+-------+-----+--------+
|----( # of blocks)---|
```
- Block(`SSTBlock`): list of independent blocks, terminated by a single empty block.
- Footer(`SSTFooter`)

### SSTBlock
```
+----------+--------+-------+-------+-----+
| BlockLen | Values | Delta | Delta | ... |
+----------+--------+-------+-------+-----+
|----( # of deltas)---|
```
- BlockLen(u32): length of the block
- Values: an application defined format storing a sequence of values, capable of determining its own length
- Delta

### Delta
```
+---------+--------+
| KeepAdd | Suffix |
+---------+--------+
```
- KeepAdd
- Suffix: KeepAdd.add bytes of key suffix

### KeepAdd
KeepAdd can be represented in two different representations: a very compact 1-byte one, which is enough for most usage, and a longer variable-length one when required.

When keep < 16 and add < 16
```
+-----+------+
| Add | Keep |
+-----+------+
```
- Add(u4): number of bytes to push
- Keep(u4): number of bytes to pop

Otherwise:
```
+------+------+-----+
| 0x01 | Keep | Add |
+------+------+-----+
```
- Add(VInt): number of bytes to push
- Keep(VInt): number of bytes to pop


Note: as the SSTable does not support redundant keys, there is no ambiguity between both representations. Add is always guaranteed to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great comment. This also implies that we do not support redundant keys. Is it written somewhere in this document?

Suggested change
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is, this could be rephrased as

Suggested change
Note: there is no ambiguity between both representation as Add is always guarantee to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.
Note: as the SSTable does not support redundant keys, there is no ambiguity between both representation. Add is always guaranteed to be non-zero, except for the very first key of an SSTable, where Keep is guaranteed to be zero.

to include that information

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oups, I forgot to add that before merging. I'll add it in the compression PR, or as an independent PR if it compression end up not making it


### SSTFooter
```
+-------+-------+-----+-------------+---------+---------+------+
| Block | Block | ... | IndexOffset | NumTerm | Version | Type |
+-------+-------+-----+-------------+---------+---------+------+
|----( # of blocks)---|
```
- Block(SSTBlock): uses IndexValue for its Values format
- IndexOffset(u64): Offset to the start of the SSTFooter
- NumTerm(u64): number of terms in the sstable
- Version(u32): Currently defined to 0x00\_00\_00\_01
- Type(u32): Defined to 0x00\_00\_00\_02

### IndexValue
```
+------------+-------+-------+-----+
| EntryCount | Entry | Entry | ... |
+------------+-------+-------+-----+
|---( # of entries)---|
```

- EntryCount(VInt): number of entries
- Entry (IndexEntry)

### Entry
```
+----------+--------------+
| BlockLen | FirstOrdinal |
+----------+--------------+
```
- BlockLen(VInt): length of the block
- FirstOrdinal(VInt): ordinal of the first element in the given block
14 changes: 7 additions & 7 deletions sstable/src/delta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ where W: io::Write
value_writer: TValueWriter,
// Only here to avoid allocations.
stateless_buffer: Vec<u8>,
block_len: usize,
}

impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
Expand All @@ -31,15 +32,14 @@ where
write: CountingWriter::wrap(BufWriter::new(wrt)),
value_writer: TValueWriter::default(),
stateless_buffer: Vec::new(),
block_len: BLOCK_LEN,
}
}
}

impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub fn set_block_len(&mut self, block_len: usize) {
self.block_len = block_len
}

pub fn flush_block(&mut self) -> io::Result<Option<Range<usize>>> {
if self.block.is_empty() {
return Ok(None);
Expand Down Expand Up @@ -82,7 +82,7 @@ where
}

pub fn flush_block_if_required(&mut self) -> io::Result<Option<Range<usize>>> {
if self.block.len() > BLOCK_LEN {
if self.block.len() > self.block_len {
return self.flush_block();
}
Ok(None)
Expand Down
13 changes: 9 additions & 4 deletions sstable/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::ops::{Bound, RangeBounds};
use std::sync::Arc;

use common::file_slice::FileSlice;
use common::{BinarySerializable, OwnedBytes};
use common::{BinarySerializable, DictionaryFooter, OwnedBytes};
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;

Expand Down Expand Up @@ -110,7 +110,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// only block for up to `limit` matching terms.
///
/// It works by identifying
/// - `first_block`: the block containing the start boudary key
/// - `first_block`: the block containing the start boundary key
/// - `last_block`: the block containing the end boundary key.
///
/// And then returning the range that spans over all blocks between.
Expand Down Expand Up @@ -178,10 +178,15 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {

/// Opens a `TermDictionary`.
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(24);
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;

let index_offset = u64::deserialize(&mut footer_len_bytes)?;
let num_terms = u64::deserialize(&mut footer_len_bytes)?;

let footer = DictionaryFooter::deserialize(&mut footer_len_bytes)?;
crate::FOOTER.verify_equal(&footer)?;

let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
let sstable_index_bytes = index_slice.read_bytes()?;
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice())
Expand Down Expand Up @@ -231,7 +236,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
let suffix = sstable_delta_reader.suffix();

match prefix_len.cmp(&ok_bytes) {
Ordering::Less => return Ok(None), // poped bytes already matched => too far
Ordering::Less => return Ok(None), // popped bytes already matched => too far
Ordering::Equal => (),
Ordering::Greater => {
// the ok prefix is less than current entry prefix => continue to next elem
Expand Down
Loading