add separate tokenizer manager for fast fields #2019

Merged (2 commits) on May 8, 2023
12 changes: 12 additions & 0 deletions src/core/index.rs
@@ -282,6 +282,7 @@ pub struct Index {
     settings: IndexSettings,
     executor: Arc<Executor>,
     tokenizers: TokenizerManager,
+    fast_field_tokenizers: TokenizerManager,
     inventory: SegmentMetaInventory,
 }
 
@@ -394,6 +395,7 @@ impl Index {
             directory,
             schema,
             tokenizers: TokenizerManager::default(),
+            fast_field_tokenizers: TokenizerManager::default(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
@@ -409,6 +411,16 @@
         &self.tokenizers
     }
 
+    /// Setter for the fast field tokenizer manager.
+    pub fn set_fast_field_tokenizers(&mut self, tokenizers: TokenizerManager) {
+        self.fast_field_tokenizers = tokenizers;
+    }
+
+    /// Accessor for the fast field tokenizer manager.
+    pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
+        &self.fast_field_tokenizers
+    }
+
     /// Get the tokenizer associated with a specific field.
     pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
         let field_entry = self.schema.get_field_entry(field);
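For orientation, here is a minimal sketch (assuming the tantivy API as of this PR) of how the two managers now sit side by side on an `Index`: `tokenizers()` keeps serving the analyzers used for indexing, while the new setter/accessor pair carries the fast field normalizers. The `custom_lowercase` name mirrors the test added in src/fastfield/mod.rs below.

```rust
use tantivy::schema::Schema;
use tantivy::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use tantivy::Index;

fn main() {
    let mut index = Index::create_in_ram(Schema::builder().build());

    // The regular tokenizer manager is untouched and still drives the inverted index.
    assert!(index.tokenizers().get("default").is_some());

    // The new, separate manager only affects how fast field values are normalized.
    let ff_tokenizers = TokenizerManager::default();
    ff_tokenizers.register(
        "custom_lowercase",
        TextAnalyzer::builder(RawTokenizer).filter(LowerCaser).build(),
    );
    index.set_fast_field_tokenizers(ff_tokenizers);
    assert!(index.fast_field_tokenizer().get("custom_lowercase").is_some());
}
```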
34 changes: 32 additions & 2 deletions src/fastfield/mod.rs
@@ -90,10 +90,11 @@ mod tests {
     use crate::directory::{Directory, RamDirectory, WritePtr};
     use crate::merge_policy::NoMergePolicy;
     use crate::schema::{
-        Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, FAST,
-        INDEXED, STORED, STRING, TEXT,
+        Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
+        TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
     };
     use crate::time::OffsetDateTime;
+    use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
     use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
 
     pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
@@ -1173,6 +1174,35 @@
         assert_eq!(&vals, &[33]);
     }
 
+    #[test]
+    fn test_fast_field_tokenizer() {
+        let mut schema_builder = Schema::builder();
+        let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
+        let text_field = schema_builder.add_text_field("text", opt);
+        let schema = schema_builder.build();
+        let ff_tokenizer_manager = TokenizerManager::default();
+        ff_tokenizer_manager.register(
+            "custom_lowercase",
+            TextAnalyzer::builder(RawTokenizer)
+                .filter(LowerCaser)
+                .build(),
+        );
+
+        let mut index = Index::create_in_ram(schema);
+        index.set_fast_field_tokenizers(ff_tokenizer_manager);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        index_writer
+            .add_document(doc!(text_field => "Test1 test2"))
+            .unwrap();
+        index_writer.commit().unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
+        let column = fast_field_reader.str("text").unwrap().unwrap();
+        let mut out = String::new();
+        column.ord_to_str(0u64, &mut out).unwrap();
+        assert_eq!(&out, "test1 test2");
+    }
+
     #[test]
     fn test_text_fast_field_tokenizer() {
         let mut schema_builder = Schema::builder();
3 changes: 2 additions & 1 deletion src/indexer/segment_writer.rs
Expand Up @@ -84,6 +84,7 @@ impl SegmentWriter {
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
Expand Down Expand Up @@ -113,7 +114,7 @@ impl SegmentWriter {
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
tokenizer_manager,
tokenizer_manager_fast_field,
)?,
doc_opstamps: Vec::with_capacity(1_000),
per_field_text_analyzers,
Expand Down
6 changes: 6 additions & 0 deletions src/schema/text_options.rs
@@ -25,8 +25,12 @@ pub struct TextOptions {
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 #[serde(untagged)]
+/// Enum to control the fast field setting of a text field.
 enum FastFieldOptions {
+    /// Flag to enable/disable the fast field.
     IsEnabled(bool),
+    /// Enable with a tokenizer. The tokenizer must be available on the fast field tokenizer
+    /// manager, `Index::fast_field_tokenizer`.
     EnabledWithTokenizer { with_tokenizer: TokenizerName },
 }
 
@@ -111,6 +115,8 @@ impl TextOptions {
     /// The effective cardinality depends on the tokenizer. Without a tokenizer, the text is
     /// stored as is, which is equivalent to the "raw" tokenizer. The tokenizer can be used to
     /// apply normalization like lower casing.
+    /// The passed tokenizer_name must be available on the fast field tokenizer manager,
+    /// `Index::fast_field_tokenizer`.
     ///
     /// The original text can be retrieved via
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
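Since `FastFieldOptions` is `#[serde(untagged)]`, the two variants serialize without a discriminator: either a bare boolean or a `{"with_tokenizer": ...}` map. A minimal, self-contained sketch of that round-trip, using a local stand-in for the crate-private enum (with `TokenizerName` simplified to `String` for illustration):

```rust
use serde::{Deserialize, Serialize};

// Local stand-in for the crate-private FastFieldOptions above;
// TokenizerName is simplified to String for this sketch.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
#[serde(untagged)]
enum FastFieldOptions {
    IsEnabled(bool),
    EnabledWithTokenizer { with_tokenizer: String },
}

fn main() {
    // A bare bool deserializes into IsEnabled, so existing `fast: true`
    // style schema settings keep working.
    let enabled: FastFieldOptions = serde_json::from_str("true").unwrap();
    assert_eq!(enabled, FastFieldOptions::IsEnabled(true));

    // A map selects EnabledWithTokenizer, matching what
    // `set_fast(Some("custom_lowercase"))` configures.
    let with_tok: FastFieldOptions =
        serde_json::from_str(r#"{"with_tokenizer":"custom_lowercase"}"#).unwrap();
    assert_eq!(
        with_tok,
        FastFieldOptions::EnabledWithTokenizer {
            with_tokenizer: "custom_lowercase".to_string()
        }
    );
}
```

Calling `set_fast(None)`, by contrast, keeps the `IsEnabled` form and stores the text as is, equivalent to the "raw" tokenizer.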