Review comments

quickwit-oss · Dec 27, 2022 · f199b88 · f199b88
1 parent 71308e6
commit f199b88
Show file tree

Hide file tree

Showing 8 changed files with 145 additions and 125 deletions.
diff --git a/columnar/Cargo.toml b/columnar/Cargo.toml
@@ -10,7 +10,6 @@ serde_json = "1"
 thiserror = "1"
 fnv = "1"
 sstable = { path = "../sstable", package = "tantivy-sstable" }
-zstd = "0.12"
 common = { path = "../common", package = "tantivy-common" }
 fastfield_codecs = { path = "../fastfield_codecs"}
 itertools = "0.10"

diff --git a/columnar/README.md b/columnar/README.md
@@ -16,42 +16,36 @@ and different cardinality `(required, optional, multivalued)`.
 
 # Coercion rules
 
-Users can create a columnar by appending rows to a writer.
-Nothing prevents a user from recording values with different to a same `column_key`.
+Users can create a columnar by inserting rows into a `ColumnarWriter`,
+and serializing it into a `Write` object.
+Nothing prevents a user from recording values of different types under the same `column_name`.
 
 In that case, `tantivy-columnar`'s behavior is as follows:
-- Values that corresponds to different JsonValue type are mapped to different columns. For instance, String values are treated independently from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
-- Only one column for a given json value type is emitted.  If number values with different number types are recorded (e.g. u64, i64, f64), `tantivy-columnar` will pick the first type that can represents the set of appended value, with the following prioriy order (`i64`, `u64`, `f64`). `i64` is picked over `u64` as it is likely to  yield less change of types. Most use cases strictly requiring `u64` show the restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative value.
+- JsonValues are grouped into 3 types (String, Number, bool).
+Values that correspond to different groups are mapped to different columns. For instance, String values are treated independently
+from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
+- Only one column for a given json value type is emitted.  If number values with different number types are recorded (e.g. u64, i64, f64),
+`tantivy-columnar` will pick the first type that can represent the set of appended values, with the following priority order (`i64`, `u64`, `f64`).
+`i64` is picked over `u64` as it is likely to yield fewer changes of type. Most use cases strictly requiring `u64` show the
+restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative values.
 
 # Columnar format
 
-Because this columnar format tries to avoid some coercion.
-There can be several columns (with different type) associated to a single `column_name`.
-
-Each column is associated to `column_key`.
-The format of that key is:
+This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
+The `(column_name, column_type)` couple however uniquely identifies a column.
+That couple is serialized as a `column_key`.  The format of that key is:
 `[column_name][ZERO_BYTE][column_type_header: u8]`
 
 ```
 COLUMNAR:=
     [COLUMNAR_DATA]
-    [COLUMNAR_INDEX]
+    [COLUMNAR_KEY_TO_DATA_INDEX]
     [COLUMNAR_FOOTER];
 
 
 # Columns are sorted by their column key.
 COLUMNAR_DATA:=
-    [COLUMN]+;
-
-COLUMN:=
-    COMPRESSED_COLUMN | NON_COMPRESSED_COLUMN;
-
-# COLUMN_DATA is compressed when it exceeds a threshold of 100KB.
-
-COMPRESSED_COLUMN := [b'1'][zstd(COLUMN_DATA)]
-NON_COMPRESSED_COLUMN:= [b'0'][COLUMN_DATA]
-
-COLUMNAR_INDEX := [RANGE_SSTABLE_BYTES]
+    [COLUMN_DATA]+;
 
 COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian]
 
@@ -63,7 +57,7 @@ sorted by column key.
 A sstable associates
 `(column names, column_cardinality, column_type)` to range of bytes.
 
-Column name may not contain the zero byte.
+Column name may not contain the zero byte `\0`.
 
 Listing all columns associated to `column_name` can therefore
 be done by listing all keys prefixed by

diff --git a/columnar/src/column_type_header.rs b/columnar/src/column_type_header.rs
@@ -1,8 +1,11 @@
 use crate::utils::{place_bits, select_bits};
 use crate::value::NumericalType;
+use crate::InvalidData;
 
 /// Enum describing the number of values that can exist per document
 /// (or per row if you will).
+///
+/// The cardinality must fit on 2 bits.
 #[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
 #[repr(u8)]
 pub enum Cardinality {
@@ -20,16 +23,20 @@ impl Cardinality {
         self as u8
     }
 
-    pub(crate) fn try_from_code(code: u8) -> Option<Cardinality> {
+    pub(crate) fn try_from_code(code: u8) -> Result<Cardinality, InvalidData> {
         match code {
-            0 => Some(Cardinality::Required),
-            1 => Some(Cardinality::Optional),
-            2 => Some(Cardinality::Multivalued),
-            _ => None,
+            0 => Ok(Cardinality::Required),
+            1 => Ok(Cardinality::Optional),
+            2 => Ok(Cardinality::Multivalued),
+            _ => Err(InvalidData),
         }
     }
 }
 
+/// The column type describes the type of the values stored in a column. Its code fits on 6 bits.
+///
+/// - bits[0..3]: Column category type.
+/// - bits[3..6]: Numerical type if necessary.
 #[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
 pub enum ColumnType {
     Bytes,
@@ -40,73 +47,79 @@ pub enum ColumnType {
 impl ColumnType {
     /// Encoded over 6 bits.
     pub(crate) fn to_code(self) -> u8 {
-        let high_type;
-        let low_code: u8;
+        let column_type_category;
+        let numerical_type_code: u8;
         match self {
             ColumnType::Bytes => {
-                high_type = GeneralType::Str;
-                low_code = 0u8;
+                column_type_category = ColumnTypeCategory::Str;
+                numerical_type_code = 0u8;
             }
             ColumnType::Numerical(numerical_type) => {
-                high_type = GeneralType::Numerical;
-                low_code = numerical_type.to_code();
+                column_type_category = ColumnTypeCategory::Numerical;
+                numerical_type_code = numerical_type.to_code();
             }
             ColumnType::Bool => {
-                high_type = GeneralType::Bool;
-                low_code = 0u8;
+                column_type_category = ColumnTypeCategory::Bool;
+                numerical_type_code = 0u8;
             }
         }
-        place_bits::<3, 6>(high_type.to_code()) | place_bits::<0, 3>(low_code)
+        place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code)
     }
 
-    pub(crate) fn try_from_code(code: u8) -> Option<ColumnType> {
+    pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
         if select_bits::<6, 8>(code) != 0u8 {
-            return None;
+            return Err(InvalidData);
         }
-        let high_code = select_bits::<3, 6>(code);
-        let low_code = select_bits::<0, 3>(code);
-        let high_type = GeneralType::try_from_code(high_code)?;
-        match high_type {
-            GeneralType::Bool => {
-                if low_code != 0u8 {
-                    return None;
+        let column_type_category_code = select_bits::<0, 3>(code);
+        let numerical_type_code = select_bits::<3, 6>(code);
+        let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?;
+        match column_type_category {
+            ColumnTypeCategory::Bool => {
+                if numerical_type_code != 0u8 {
+                    return Err(InvalidData);
                 }
-                Some(ColumnType::Bool)
+                Ok(ColumnType::Bool)
             }
-            GeneralType::Str => {
-                if low_code != 0u8 {
-                    return None;
+            ColumnTypeCategory::Str => {
+                if numerical_type_code != 0u8 {
+                    return Err(InvalidData);
                 }
-                Some(ColumnType::Bytes)
+                Ok(ColumnType::Bytes)
             }
-            GeneralType::Numerical => {
-                let numerical_type = NumericalType::try_from_code(low_code)?;
-                Some(ColumnType::Numerical(numerical_type))
+            ColumnTypeCategory::Numerical => {
+                let numerical_type = NumericalType::try_from_code(numerical_type_code)?;
+                Ok(ColumnType::Numerical(numerical_type))
             }
         }
     }
 }
 
-/// This corresponds to the JsonType.
+/// Column types are grouped into different categories that
+/// correspond to the different `JsonValue` types.
+///
+/// The columnar writer will apply coercion rules to make sure that
+/// at most one column exists per `ColumnTypeCategory`.
+///
+/// See also [README.md].
 #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
 #[repr(u8)]
-pub(crate) enum GeneralType {
+pub(crate) enum ColumnTypeCategory {
     Bool = 0u8,
     Str = 1u8,
     Numerical = 2u8,
 }
 
-impl GeneralType {
+impl ColumnTypeCategory {
     pub fn to_code(self) -> u8 {
         self as u8
     }
 
-    pub fn try_from_code(code: u8) -> Option<Self> {
+    pub fn try_from_code(code: u8) -> Result<Self, InvalidData> {
         match code {
-            0u8 => Some(Self::Bool),
-            1u8 => Some(Self::Str),
-            2u8 => Some(Self::Numerical),
-            _ => None,
+            0u8 => Ok(Self::Bool),
+            1u8 => Ok(Self::Str),
+            2u8 => Ok(Self::Numerical),
+            _ => Err(InvalidData),
         }
     }
 }
@@ -115,26 +128,26 @@ impl GeneralType {
 /// This is encoded over one-byte and added to a column key in the
 /// columnar sstable.
 ///
-/// Cardinality is encoded as the first two highest two bits.
-/// The low 6 bits encode the column type.
+/// - [0..6] bits: encodes the column type
+/// - [6..8] bits: encodes the cardinality
 #[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)]
 pub struct ColumnTypeAndCardinality {
-    pub cardinality: Cardinality,
     pub typ: ColumnType,
+    pub cardinality: Cardinality,
 }
 
 impl ColumnTypeAndCardinality {
     pub fn to_code(self) -> u8 {
         place_bits::<6, 8>(self.cardinality.to_code()) | place_bits::<0, 6>(self.typ.to_code())
     }
 
-    pub fn try_from_code(code: u8) -> Option<ColumnTypeAndCardinality> {
+    pub fn try_from_code(code: u8) -> Result<ColumnTypeAndCardinality, InvalidData> {
         let typ_code = select_bits::<0, 6>(code);
         let cardinality_code = select_bits::<6, 8>(code);
         let cardinality = Cardinality::try_from_code(cardinality_code)?;
         let typ = ColumnType::try_from_code(typ_code)?;
         assert_eq!(typ.to_code(), typ_code);
-        Some(ColumnTypeAndCardinality { cardinality, typ })
+        Ok(ColumnTypeAndCardinality { cardinality, typ })
     }
 }
 
@@ -149,7 +162,7 @@ mod tests {
     fn test_column_type_header_to_code() {
         let mut column_type_header_set: HashSet<ColumnTypeAndCardinality> = HashSet::new();
         for code in u8::MIN..=u8::MAX {
-            if let Some(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) {
+            if let Ok(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) {
                 assert_eq!(column_type_header.to_code(), code);
                 assert!(column_type_header_set.insert(column_type_header));
             }
@@ -165,7 +178,7 @@ mod tests {
     fn test_column_type_to_code() {
         let mut column_type_set: HashSet<ColumnType> = HashSet::new();
         for code in u8::MIN..=u8::MAX {
-            if let Some(column_type) = ColumnType::try_from_code(code) {
+            if let Ok(column_type) = ColumnType::try_from_code(code) {
                 assert_eq!(column_type.to_code(), code);
                 assert!(column_type_set.insert(column_type));
             }
@@ -177,8 +190,7 @@ mod tests {
     fn test_cardinality_to_code() {
         let mut num_cardinality = 0;
         for code in u8::MIN..=u8::MAX {
-            let cardinality_opt = Cardinality::try_from_code(code);
-            if let Some(cardinality) = cardinality_opt {
+            if let Ok(cardinality) = Cardinality::try_from_code(code) {
                 assert_eq!(cardinality.to_code(), code);
                 num_cardinality += 1;
             }

diff --git a/columnar/src/dictionary.rs b/columnar/src/dictionary.rs
@@ -3,11 +3,11 @@ use std::io;
 use fnv::FnvHashMap;
 use sstable::SSTable;
 
-pub(crate) struct IdMapping {
+pub(crate) struct TermIdMapping {
     unordered_to_ord: Vec<OrderedId>,
 }
 
-impl IdMapping {
+impl TermIdMapping {
     pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId {
         self.unordered_to_ord[unordered.0 as usize]
     }
@@ -48,7 +48,7 @@ impl DictionaryBuilder {
 
     /// Serialize the dictionary into an fst, and returns the
     /// `UnorderedId -> TermOrdinal` map.
-    pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<IdMapping> {
+    pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
         let mut terms: Vec<(&[u8], UnorderedId)> =
             self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
         terms.sort_unstable_by_key(|(key, _)| *key);
@@ -61,7 +61,7 @@ impl DictionaryBuilder {
             unordered_to_ord[unordered_id.0 as usize] = ordered_id;
         }
         sstable_builder.finish()?;
-        Ok(IdMapping { unordered_to_ord })
+        Ok(TermIdMapping { unordered_to_ord })
     }
 }
 

diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs
@@ -12,6 +12,9 @@ pub use writer::ColumnarWriter;
 
 pub type DocId = u32;
 
+#[derive(Copy, Clone, Debug)]
+pub struct InvalidData;
+
 #[cfg(test)]
 mod tests {
     use std::ops::Range;
@@ -26,8 +29,8 @@ mod tests {
     #[test]
     fn test_dataframe_writer_bytes() {
         let mut dataframe_writer = ColumnarWriter::default();
-        dataframe_writer.record_str(1u32, "my_string", b"hello");
-        dataframe_writer.record_str(3u32, "my_string", b"helloeee");
+        dataframe_writer.record_str(1u32, "my_string", "hello");
+        dataframe_writer.record_str(3u32, "my_string", "helloeee");
         let mut buffer: Vec<u8> = Vec::new();
         dataframe_writer.serialize(5, &mut buffer).unwrap();
         let columnar_fileslice = FileSlice::from(buffer);
@@ -36,7 +39,7 @@ mod tests {
         let cols: Vec<(ColumnTypeAndCardinality, Range<u64>)> =
             columnar.read_columns("my_string").unwrap();
         assert_eq!(cols.len(), 1);
-        assert_eq!(cols[0].1, 0..159);
+        assert_eq!(cols[0].1, 0..158);
     }
 
     #[test]
@@ -58,7 +61,7 @@ mod tests {
                 typ: ColumnType::Bool
             }
         );
-        assert_eq!(cols[0].1, 0..22);
+        assert_eq!(cols[0].1, 0..21);
     }
 
     #[test]
@@ -81,6 +84,6 @@ mod tests {
         // - vals  8 //< due to padding? could have been 1byte?.
         // - null footer 6 bytes
         // - version footer 3 bytes // Should be file-wide
-        assert_eq!(cols[0].1, 0..32);
+        assert_eq!(cols[0].1, 0..31);
     }
 }