From 0cc185f186f6fc8d479504e27f1b962b64012944 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 1 May 2023 10:17:41 +0800 Subject: [PATCH 1/2] add separate tokenizer manager for fast fields --- src/core/index.rs | 12 ++++++++++++ src/fastfield/mod.rs | 34 ++++++++++++++++++++++++++++++++-- src/indexer/segment_writer.rs | 3 ++- src/schema/text_options.rs | 6 ++++++ 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 3212e6295a..fa5a3a8e03 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -282,6 +282,7 @@ pub struct Index { settings: IndexSettings, executor: Arc, tokenizers: TokenizerManager, + fast_field_tokenizers: TokenizerManager, inventory: SegmentMetaInventory, } @@ -394,6 +395,7 @@ impl Index { directory, schema, tokenizers: TokenizerManager::default(), + fast_field_tokenizers: TokenizerManager::default(), executor: Arc::new(Executor::single_thread()), inventory, } @@ -409,6 +411,16 @@ impl Index { &self.tokenizers } + /// Setter for the fast field tokenizer manager. + pub fn set_fast_field_tokenizers(&mut self, tokenizers: TokenizerManager) { + self.fast_field_tokenizers = tokenizers; + } + + /// Accessor for the fast field tokenizer manager. + pub fn tokenizer_fast_field(&self) -> &TokenizerManager { + &self.fast_field_tokenizers + } + /// Get the tokenizer associated with a specific field. 
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result { let field_entry = self.schema.get_field_entry(field); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index c8d7b7c61b..e6c6cd582d 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -90,10 +90,11 @@ mod tests { use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::merge_policy::NoMergePolicy; use crate::schema::{ - Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, FAST, - INDEXED, STORED, STRING, TEXT, + Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, + TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::time::OffsetDateTime; + use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader}; pub static SCHEMA: Lazy = Lazy::new(|| { @@ -1173,6 +1174,35 @@ mod tests { assert_eq!(&vals, &[33]); } + #[test] + fn test_fast_field_tokenizer() { + let mut schema_builder = Schema::builder(); + let opt = TextOptions::default().set_fast(Some("custom_lowercase")); + let text_field = schema_builder.add_text_field("text", opt); + let schema = schema_builder.build(); + let ff_tokenizer_manager = TokenizerManager::default(); + ff_tokenizer_manager.register( + "custom_lowercase", + TextAnalyzer::builder(RawTokenizer) + .filter(LowerCaser) + .build(), + ); + + let mut index = Index::create_in_ram(schema); + index.set_fast_field_tokenizers(ff_tokenizer_manager); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(text_field => "Test1 test2")) + .unwrap(); + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let fast_field_reader = searcher.segment_reader(0u32).fast_fields(); + let column = fast_field_reader.str("text").unwrap().unwrap(); + let mut out = String::new(); + column.ord_to_str(0u64, &mut out).unwrap(); + assert_eq!(&out, "test1 
test2"); + } + #[test] fn test_text_fast_field_tokenizer() { let mut schema_builder = Schema::builder(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 1c8cd0ce6e..a207fd2eaf 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -84,6 +84,7 @@ impl SegmentWriter { ) -> crate::Result { let schema = segment.schema(); let tokenizer_manager = segment.index().tokenizers().clone(); + let tokenizer_manager_fast_field = segment.index().tokenizer_fast_field().clone(); let table_size = compute_initial_table_size(memory_budget_in_bytes)?; let segment_serializer = SegmentSerializer::for_segment(segment, false)?; let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema); @@ -113,7 +114,7 @@ impl SegmentWriter { segment_serializer, fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager( &schema, - tokenizer_manager, + tokenizer_manager_fast_field, )?, doc_opstamps: Vec::with_capacity(1_000), per_field_text_analyzers, diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index f049ba8e1a..57f2f10229 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -25,8 +25,12 @@ pub struct TextOptions { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(untagged)] +/// Enum to control how the fast field setting of a text field. enum FastFieldOptions { + /// Flag to enable/disable IsEnabled(bool), + /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager. + /// `Index::tokenizer_fast_field`. EnabledWithTokenizer { with_tokenizer: TokenizerName }, } @@ -111,6 +115,8 @@ impl TextOptions { /// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be /// stored as is, which equals to the "raw" tokenizer. The tokenizer can be used to apply /// normalization like lower case. + /// The passed tokenizer_name must be available on the fast field tokenizer manager. 
+ /// `Index::tokenizer_fast_field`. /// /// The original text can be retrieved via /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term) From 029f7695b2ef1e1ca550a2baa2391d3e4529a7bf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 8 May 2023 15:41:24 +0800 Subject: [PATCH 2/2] rename --- src/core/index.rs | 2 +- src/indexer/segment_writer.rs | 2 +- src/schema/text_options.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index fa5a3a8e03..4ac0c3cdcf 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -417,7 +417,7 @@ impl Index { } /// Accessor for the fast field tokenizer manager. - pub fn tokenizer_fast_field(&self) -> &TokenizerManager { + pub fn fast_field_tokenizer(&self) -> &TokenizerManager { &self.fast_field_tokenizers } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index a207fd2eaf..c21d7d5348 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -84,7 +84,7 @@ impl SegmentWriter { ) -> crate::Result { let schema = segment.schema(); let tokenizer_manager = segment.index().tokenizers().clone(); - let tokenizer_manager_fast_field = segment.index().tokenizer_fast_field().clone(); + let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone(); let table_size = compute_initial_table_size(memory_budget_in_bytes)?; let segment_serializer = SegmentSerializer::for_segment(segment, false)?; let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema); diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 57f2f10229..cd1a04a226 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -30,7 +30,7 @@ enum FastFieldOptions { /// Flag to enable/disable IsEnabled(bool), /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager. - /// `Index::tokenizer_fast_field`. + /// `Index::fast_field_tokenizer`. 
EnabledWithTokenizer { with_tokenizer: TokenizerName }, } @@ -116,7 +116,7 @@ impl TextOptions { /// stored as is, which equals to the "raw" tokenizer. The tokenizer can be used to apply /// normalization like lower case. /// The passed tokenizer_name must be available on the fast field tokenizer manager. - /// `Index::tokenizer_fast_field`. + /// `Index::fast_field_tokenizer`. /// /// The original text can be retrieved via /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)