From 97e592dff1a6cf8a04c20e33ecda2ce5274974f4 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sat, 23 Jul 2022 00:48:01 +0400 Subject: [PATCH 01/11] add possibility to change highlight prefix and postfix --- src/snippet/mod.rs | 124 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 26 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c67df021dc..34cea681c7 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -50,35 +50,51 @@ impl FragmentCandidate { } /// `Snippet` -/// Contains a fragment of a document, and some highlighed parts inside it. +/// Contains a fragment of a document, and some highlighted parts inside it. #[derive(Debug)] pub struct Snippet { fragment: String, highlighted: Vec>, + highlighten_prefix: String, + highlighten_postfix: String, } -const HIGHLIGHTEN_PREFIX: &str = ""; -const HIGHLIGHTEN_POSTFIX: &str = ""; - impl Snippet { + // TODO add a comment + fn new( + fragment: &str, + highlighted: Vec>, + highlighten_prefix: &str, + highlighten_postfix: &str, + ) -> Self { + Self { + fragment: fragment.to_string(), + highlighted, + highlighten_prefix: highlighten_prefix.to_string(), + highlighten_postfix: highlighten_postfix.to_string(), + } + } + /// Create a new, empty, `Snippet` pub fn empty() -> Snippet { Snippet { fragment: String::new(), highlighted: Vec::new(), + highlighten_prefix: String::new(), + highlighten_postfix: String::new(), } } - /// Returns a hignlightned html from the `Snippet`. + /// Returns a highlighted html from the `Snippet`. pub fn to_html(&self) -> String { let mut html = String::new(); let mut start_from: usize = 0; for item in self.highlighted.iter() { html.push_str(&encode_minimal(&self.fragment[start_from..item.start])); - html.push_str(HIGHLIGHTEN_PREFIX); + html.push_str(&self.highlighten_prefix); html.push_str(&encode_minimal(&self.fragment[item.clone()])); - html.push_str(HIGHLIGHTEN_POSTFIX); + html.push_str(&self.highlighten_postfix); start_from = item.end; } html.push_str(&encode_minimal( @@ -92,7 +108,7 @@ impl Snippet { &self.fragment } - /// Returns a list of higlighted positions from the `Snippet`. + /// Returns a list of highlighted positions from the `Snippet`. pub fn highlighted(&self) -> &[Range] { &self.highlighted } @@ -148,7 +164,12 @@ fn search_fragments<'a>( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. -fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet { +fn select_best_fragment_combination( + fragments: &[FragmentCandidate], + text: &str, + highlighten_prefix: &str, + highlighten_postfix: &str, +) -> Snippet { let best_fragment_opt = fragments.iter().max_by(|left, right| { let cmp_score = left .score @@ -167,17 +188,16 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) .iter() .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset) .collect(); - Snippet { - fragment: fragment_text.to_string(), + Snippet::new( + fragment_text, highlighted, - } + highlighten_prefix, + highlighten_postfix, + ) } else { // when there no fragments to chose from, // for now create a empty snippet - Snippet { - fragment: String::new(), - highlighted: vec![], - } + Snippet::empty() } } @@ -227,6 +247,8 @@ pub struct SnippetGenerator { tokenizer: TextAnalyzer, field: Field, max_num_chars: usize, + highlighten_prefix: String, + highlighten_postfix: String, } impl SnippetGenerator { @@ -260,6 +282,8 @@ impl SnippetGenerator { tokenizer, field, max_num_chars: DEFAULT_MAX_NUM_CHARS, + highlighten_prefix: "".to_string(), + highlighten_postfix: "".to_string(), }) } @@ -290,7 +314,12 @@ impl SnippetGenerator { pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars); - select_best_fragment_combination(&fragment_candidates[..], text) + select_best_fragment_combination( + &fragment_candidates[..], + text, + &self.highlighten_prefix, + &self.highlighten_postfix, + ) } } @@ -320,6 +349,9 @@ to the project are from community members.[15] Rust won first place for "most loved programming language" in the Stack Overflow Developer Survey in 2016, 2017, and 2018."#; + const HIGHLIGHTEN_PREFIX: &str = ""; + const HIGHLIGHTEN_POSTFIX: &str = ""; + #[test] fn test_snippet() { let terms = btreemap! { @@ -333,7 +365,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination( + &fragments[..], + TEST_TEXT, + "", + HIGHLIGHTEN_POSTFIX, + ); assert_eq!( snippet.fragment, "Rust is a systems programming language sponsored by\nMozilla which describes it as a \ @@ -341,7 +378,7 @@ Survey in 2016, 2017, and 2018."#; ); assert_eq!( snippet.to_html(), - "Rust is a systems programming language sponsored by\nMozilla which \ + "Rust is a systems programming language sponsored by\nMozilla which \ describes it as a "safe" ) } @@ -359,7 +396,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.0); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination( + &fragments[..], + TEST_TEXT, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.to_html(), "Rust is a systems") } { @@ -374,7 +416,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 0.9); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + let snippet = select_best_fragment_combination( + &fragments[..], + TEST_TEXT, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.to_html(), "programming language") } } @@ -396,7 +443,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.stop_offset, 7); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination( + &fragments[..], + text, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.fragment, "c d"); assert_eq!(snippet.to_html(), "c d"); } @@ -418,7 +470,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 8); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination( + &fragments[..], + text, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.fragment, "e f"); assert_eq!(snippet.to_html(), "e f"); } @@ -441,7 +498,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 0); } - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination( + &fragments[..], + text, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.fragment, "e f g"); assert_eq!(snippet.to_html(), "e f g"); } @@ -457,7 +519,12 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination( + &fragments[..], + text, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); } @@ -470,7 +537,12 @@ Survey in 2016, 2017, and 2018."#; let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination(&fragments[..], text); + let snippet = select_best_fragment_combination( + &fragments[..], + text, + HIGHLIGHTEN_PREFIX, + HIGHLIGHTEN_POSTFIX, + ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); } From aecaa04ac028fa7b0db90cd090fd999f6b99aa3e Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sat, 23 Jul 2022 00:51:31 +0400 Subject: [PATCH 02/11] add comment to Snippet::new --- src/snippet/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 34cea681c7..b1af62eda0 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -60,7 +60,7 @@ pub struct Snippet { } impl Snippet { - // TODO add a comment + /// Create a new `Snippet`. fn new( fragment: &str, highlighted: Vec>, @@ -75,7 +75,7 @@ impl Snippet { } } - /// Create a new, empty, `Snippet` + /// Create a new, empty, `Snippet`. pub fn empty() -> Snippet { Snippet { fragment: String::new(), From 2960e8d548b96dbb1e239b145cfc3c4f82f27cf1 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sat, 23 Jul 2022 00:57:42 +0400 Subject: [PATCH 03/11] add test for highlighten elements --- src/snippet/mod.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index b1af62eda0..c9ed692531 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -287,6 +287,12 @@ impl SnippetGenerator { }) } + /// Sets highlighten prefix and postfix. + pub fn set_highlighten_elements(&mut self, prefix: &str, postfix: &str) { + self.highlighten_prefix = prefix.to_string(); + self.highlighten_postfix = postfix.to_string() + } + /// Sets a maximum number of chars. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; @@ -638,4 +644,49 @@ Survey in 2016, 2017, and 2018."#; } Ok(()) } + + #[test] + fn test_snippet_generator_custom_highlighten_elements() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("en_stem") + .set_index_option(IndexRecordOption::Basic), + ); + let text_field = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_for_tests()?; + let doc = doc!(text_field => TEST_TEXT); + index_writer.add_document(doc)?; + index_writer.commit()?; + } + let searcher = index.reader().unwrap().searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("rust design").unwrap(); + let mut snippet_generator = + SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); + { + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!( + snippet.to_html(), + "imperative-procedural paradigms. Rust is syntactically similar to \ + C++[according to whom?],\nbut its designers intend it to provide better \ + memory safety" + ); + } + { + snippet_generator.set_max_num_chars(90); + snippet_generator.set_highlighten_elements("", ""); + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!( + snippet.to_html(), + "Rust is syntactically similar to C++[according to whom?],\nbut its \ + designers intend it to" + ); + } + Ok(()) + } } From 75b62e758306e47042c5f0e9c70748e35bd80325 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sat, 23 Jul 2022 01:01:58 +0400 Subject: [PATCH 04/11] add default highlight prefix and postfix constants --- src/snippet/mod.rs | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c9ed692531..dcf9915ab6 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -11,6 +11,9 @@ use crate::{Document, Score, Searcher}; const DEFAULT_MAX_NUM_CHARS: usize = 150; +const DEFAULT_HIGHLIGHTEN_PREFIX: &str = ""; +const DEFAULT_HIGHLIGHTEN_POSTFIX: &str = ""; + #[derive(Debug)] pub struct FragmentCandidate { score: Score, @@ -282,8 +285,8 @@ impl SnippetGenerator { tokenizer, field, max_num_chars: DEFAULT_MAX_NUM_CHARS, - highlighten_prefix: "".to_string(), - highlighten_postfix: "".to_string(), + highlighten_prefix: DEFAULT_HIGHLIGHTEN_PREFIX.to_string(), + highlighten_postfix: DEFAULT_HIGHLIGHTEN_POSTFIX.to_string(), }) } @@ -335,7 +338,10 @@ mod tests { use maplit::btreemap; - use super::{search_fragments, select_best_fragment_combination}; + use super::{ + search_fragments, select_best_fragment_combination, DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + }; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::tokenizer::SimpleTokenizer; @@ -355,9 +361,6 @@ to the project are from community members.[15] Rust won first place for "most loved programming language" in the Stack Overflow Developer Survey in 2016, 2017, and 2018."#; - const HIGHLIGHTEN_PREFIX: &str = ""; - const HIGHLIGHTEN_POSTFIX: &str = ""; - #[test] fn test_snippet() { let terms = btreemap! { @@ -375,7 +378,7 @@ Survey in 2016, 2017, and 2018."#; &fragments[..], TEST_TEXT, "", - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!( snippet.fragment, @@ -405,8 +408,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], TEST_TEXT, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.to_html(), "Rust is a systems") } @@ -425,8 +428,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], TEST_TEXT, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.to_html(), "programming language") } @@ -452,8 +455,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.fragment, "c d"); assert_eq!(snippet.to_html(), "c d"); @@ -479,8 +482,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.fragment, "e f"); assert_eq!(snippet.to_html(), "e f"); @@ -507,8 +510,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.fragment, "e f g"); assert_eq!(snippet.to_html(), "e f g"); @@ -528,8 +531,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); @@ -546,8 +549,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - HIGHLIGHTEN_PREFIX, - HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTEN_PREFIX, + DEFAULT_HIGHLIGHTEN_POSTFIX, ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); From 06ccec4bbb6299a23ad3c2a62dacf2a014375b24 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Wed, 27 Jul 2022 19:38:37 +0400 Subject: [PATCH 05/11] fix spelling --- src/snippet/mod.rs | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index dcf9915ab6..31a2e142d8 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -11,8 +11,8 @@ use crate::{Document, Score, Searcher}; const DEFAULT_MAX_NUM_CHARS: usize = 150; -const DEFAULT_HIGHLIGHTEN_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHTEN_POSTFIX: &str = ""; +const DEFAULT_HIGHLIGHTING_PREFIX: &str = ""; +const DEFAULT_HIGHLIGHTING_POSTFIX: &str = ""; #[derive(Debug)] pub struct FragmentCandidate { @@ -58,8 +58,8 @@ impl FragmentCandidate { pub struct Snippet { fragment: String, highlighted: Vec>, - highlighten_prefix: String, - highlighten_postfix: String, + highlighting_prefix: String, + highlighting_postfix: String, } impl Snippet { @@ -67,14 +67,14 @@ impl Snippet { fn new( fragment: &str, highlighted: Vec>, - highlighten_prefix: &str, - highlighten_postfix: &str, + highlighting_prefix: &str, + highlighting_postfix: &str, ) -> Self { Self { fragment: fragment.to_string(), highlighted, - highlighten_prefix: highlighten_prefix.to_string(), - highlighten_postfix: highlighten_postfix.to_string(), + highlighting_prefix: highlighting_prefix.to_string(), + highlighting_postfix: highlighting_postfix.to_string(), } } @@ -83,8 +83,8 @@ impl Snippet { Snippet { fragment: String::new(), highlighted: Vec::new(), - highlighten_prefix: String::new(), - highlighten_postfix: String::new(), + highlighting_prefix: String::new(), + highlighting_postfix: String::new(), } } @@ -95,9 +95,9 @@ impl Snippet { for item in self.highlighted.iter() { html.push_str(&encode_minimal(&self.fragment[start_from..item.start])); - html.push_str(&self.highlighten_prefix); + html.push_str(&self.highlighting_prefix); html.push_str(&encode_minimal(&self.fragment[item.clone()])); - html.push_str(&self.highlighten_postfix); + html.push_str(&self.highlighting_postfix); start_from = item.end; } html.push_str(&encode_minimal( @@ -170,8 +170,8 @@ fn search_fragments<'a>( fn select_best_fragment_combination( fragments: &[FragmentCandidate], text: &str, - highlighten_prefix: &str, - highlighten_postfix: &str, + highlighting_prefix: &str, + highlighting_postfix: &str, ) -> Snippet { let best_fragment_opt = fragments.iter().max_by(|left, right| { let cmp_score = left @@ -194,8 +194,8 @@ fn select_best_fragment_combination( Snippet::new( fragment_text, highlighted, - highlighten_prefix, - highlighten_postfix, + highlighting_prefix, + highlighting_postfix, ) } else { // when there no fragments to chose from, @@ -250,8 +250,8 @@ pub struct SnippetGenerator { tokenizer: TextAnalyzer, field: Field, max_num_chars: usize, - highlighten_prefix: String, - highlighten_postfix: String, + highlighting_prefix: String, + highlighting_postfix: String, } impl SnippetGenerator { @@ -285,15 +285,15 @@ impl SnippetGenerator { tokenizer, field, max_num_chars: DEFAULT_MAX_NUM_CHARS, - highlighten_prefix: DEFAULT_HIGHLIGHTEN_PREFIX.to_string(), - highlighten_postfix: DEFAULT_HIGHLIGHTEN_POSTFIX.to_string(), + highlighting_prefix: DEFAULT_HIGHLIGHTING_PREFIX.to_string(), + highlighting_postfix: DEFAULT_HIGHLIGHTING_POSTFIX.to_string(), }) } - /// Sets highlighten prefix and postfix. - pub fn set_highlighten_elements(&mut self, prefix: &str, postfix: &str) { - self.highlighten_prefix = prefix.to_string(); - self.highlighten_postfix = postfix.to_string() + /// Sets highlighted prefix and postfix. + pub fn set_highlighted_elements(&mut self, prefix: &str, postfix: &str) { + self.highlighting_prefix = prefix.to_string(); + self.highlighting_postfix = postfix.to_string() } /// Sets a maximum number of chars. @@ -326,8 +326,8 @@ impl SnippetGenerator { select_best_fragment_combination( &fragment_candidates[..], text, - &self.highlighten_prefix, - &self.highlighten_postfix, + &self.highlighting_prefix, + &self.highlighting_postfix, ) } } @@ -339,8 +339,8 @@ mod tests { use maplit::btreemap; use super::{ - search_fragments, select_best_fragment_combination, DEFAULT_HIGHLIGHTEN_POSTFIX, - DEFAULT_HIGHLIGHTEN_PREFIX, + search_fragments, select_best_fragment_combination, DEFAULT_HIGHLIGHTING_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, }; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; @@ -377,8 +377,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], TEST_TEXT, - "", - DEFAULT_HIGHLIGHTEN_POSTFIX, + "", + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!( snippet.fragment, @@ -387,7 +387,7 @@ Survey in 2016, 2017, and 2018."#; ); assert_eq!( snippet.to_html(), - "Rust is a systems programming language sponsored by\nMozilla which \ + "Rust is a systems programming language sponsored by\nMozilla which \ describes it as a "safe" ) } @@ -408,8 +408,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], TEST_TEXT, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.to_html(), "Rust is a systems") } @@ -428,8 +428,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], TEST_TEXT, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.to_html(), "programming language") } @@ -455,8 +455,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.fragment, "c d"); assert_eq!(snippet.to_html(), "c d"); @@ -482,8 +482,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.fragment, "e f"); assert_eq!(snippet.to_html(), "e f"); @@ -510,8 +510,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.fragment, "e f g"); assert_eq!(snippet.to_html(), "e f g"); @@ -531,8 +531,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); @@ -549,8 +549,8 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination( &fragments[..], text, - DEFAULT_HIGHLIGHTEN_PREFIX, - DEFAULT_HIGHLIGHTEN_POSTFIX, + DEFAULT_HIGHLIGHTING_PREFIX, + DEFAULT_HIGHLIGHTING_POSTFIX, ); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); @@ -649,7 +649,7 @@ Survey in 2016, 2017, and 2018."#; } #[test] - fn test_snippet_generator_custom_highlighten_elements() -> crate::Result<()> { + fn test_snippet_generator_custom_highlighted_elements() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let text_options = TextOptions::default().set_indexing_options( TextFieldIndexing::default() @@ -682,7 +682,7 @@ Survey in 2016, 2017, and 2018."#; } { snippet_generator.set_max_num_chars(90); - snippet_generator.set_highlighten_elements("", ""); + snippet_generator.set_highlighted_elements("", ""); let snippet = snippet_generator.snippet(TEST_TEXT); assert_eq!( snippet.to_html(), From 5d6890bf2d396ea2eb7298d1fa96a28436d48eeb Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Wed, 27 Jul 2022 19:41:03 +0400 Subject: [PATCH 06/11] fix tests --- src/snippet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 31a2e142d8..8685695e7f 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -387,7 +387,7 @@ Survey in 2016, 2017, and 2018."#; ); assert_eq!( snippet.to_html(), - "Rust is a systems programming language sponsored by\nMozilla which \ + "Rust is a systems programming language sponsored by\nMozilla which \ describes it as a "safe" ) } From 75dd089bf65eb1ba0c3af9747eba1f3efd2cf351 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Fri, 29 Jul 2022 00:53:42 +0400 Subject: [PATCH 07/11] fix spelling --- src/snippet/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8685695e7f..0ebc23c565 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -198,8 +198,8 @@ fn select_best_fragment_combination( highlighting_postfix, ) } else { - // when there no fragments to chose from, - // for now create a empty snippet + // When there are no fragments to chose from, + // for now create an empty snippet. Snippet::empty() } } From 2c33aa2aa64f67447367aa7491a8548dc6553d9e Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sun, 4 Sep 2022 12:04:05 +0400 Subject: [PATCH 08/11] do fixes after code review --- src/snippet/mod.rs | 114 ++++++++++----------------------------------- 1 file changed, 24 insertions(+), 90 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8038d0070c..61896de146 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -64,17 +64,12 @@ pub struct Snippet { impl Snippet { /// Create a new `Snippet`. - fn new( - fragment: &str, - highlighted: Vec>, - highlighting_prefix: &str, - highlighting_postfix: &str, - ) -> Self { + fn new(fragment: &str, highlighted: Vec>) -> Self { Self { fragment: fragment.to_string(), highlighted, - highlighting_prefix: highlighting_prefix.to_string(), - highlighting_postfix: highlighting_postfix.to_string(), + highlighting_prefix: DEFAULT_HIGHLIGHTING_PREFIX.to_string(), + highlighting_postfix: DEFAULT_HIGHLIGHTING_POSTFIX.to_string(), } } @@ -120,6 +115,12 @@ impl Snippet { pub fn highlighted(&self) -> &[Range] { &self.highlighted } + + /// Sets highlighted prefix and postfix. + pub fn set_highlighted_elements(&mut self, prefix: &str, postfix: &str) { + self.highlighting_prefix = prefix.to_string(); + self.highlighting_postfix = postfix.to_string() + } } /// Returns a non-empty list of "good" fragments. @@ -172,12 +173,7 @@ fn search_fragments<'a>( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. -fn select_best_fragment_combination( - fragments: &[FragmentCandidate], - text: &str, - highlighting_prefix: &str, - highlighting_postfix: &str, -) -> Snippet { +fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet { let best_fragment_opt = fragments.iter().max_by(|left, right| { let cmp_score = left .score @@ -196,12 +192,7 @@ fn select_best_fragment_combination( .iter() .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset) .collect(); - Snippet::new( - fragment_text, - highlighted, - highlighting_prefix, - highlighting_postfix, - ) + Snippet::new(fragment_text, highlighted) } else { // When there are no fragments to chose from, // for now create an empty snippet. @@ -302,8 +293,6 @@ pub struct SnippetGenerator { tokenizer: TextAnalyzer, field: Field, max_num_chars: usize, - highlighting_prefix: String, - highlighting_postfix: String, } impl SnippetGenerator { @@ -352,17 +341,9 @@ impl SnippetGenerator { tokenizer, field, max_num_chars: DEFAULT_MAX_NUM_CHARS, - highlighting_prefix: DEFAULT_HIGHLIGHTING_PREFIX.to_string(), - highlighting_postfix: DEFAULT_HIGHLIGHTING_POSTFIX.to_string(), }) } - /// Sets highlighted prefix and postfix. - pub fn set_highlighted_elements(&mut self, prefix: &str, postfix: &str) { - self.highlighting_prefix = prefix.to_string(); - self.highlighting_postfix = postfix.to_string() - } - /// Sets a maximum number of chars. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; @@ -390,12 +371,7 @@ impl SnippetGenerator { pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars); - select_best_fragment_combination( - &fragment_candidates[..], - text, - &self.highlighting_prefix, - &self.highlighting_postfix, - ) + select_best_fragment_combination(&fragment_candidates[..], text) } } @@ -410,9 +386,7 @@ mod tests { use crate::tokenizer::{NgramTokenizer, SimpleTokenizer}; use crate::{Index, SnippetGenerator}; - use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination,DEFAULT_HIGHLIGHTING_POSTFIX, - DEFAULT_HIGHLIGHTING_PREFIX, - }; + use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination}; const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and @@ -441,12 +415,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination( - &fragments[..], - TEST_TEXT, - "", - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!( snippet.fragment, "Rust is a systems programming language sponsored by\nMozilla which describes it as a \ @@ -454,7 +423,7 @@ Survey in 2016, 2017, and 2018."#; ); assert_eq!( snippet.to_html(), - "Rust is a systems programming language sponsored by\nMozilla which \ + "Rust is a systems programming language sponsored by\nMozilla which \ describes it as a "safe" ) } @@ -472,12 +441,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.0); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination( - &fragments[..], - TEST_TEXT, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!(snippet.to_html(), "Rust is a systems") } { @@ -492,12 +456,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 0.9); assert_eq!(first.stop_offset, 17); } - let snippet = select_best_fragment_combination( - &fragments[..], - TEST_TEXT, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!(snippet.to_html(), "programming language") } } @@ -519,12 +478,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.stop_offset, 7); } - let snippet = select_best_fragment_combination( - &fragments[..], - text, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], text); assert_eq!(snippet.fragment, "c d"); assert_eq!(snippet.to_html(), "c d"); } @@ -546,12 +500,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 8); } - let snippet = select_best_fragment_combination( - &fragments[..], - text, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], text); assert_eq!(snippet.fragment, "e f"); assert_eq!(snippet.to_html(), "e f"); } @@ -574,12 +523,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 0); } - let snippet = select_best_fragment_combination( - &fragments[..], - text, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], text); assert_eq!(snippet.fragment, "e f g"); assert_eq!(snippet.to_html(), "e f g"); } @@ -595,12 +539,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination( - &fragments[..], - text, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], text); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); assert!(snippet.is_empty()); @@ -614,12 +553,7 @@ Survey in 2016, 2017, and 2018."#; let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3); assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination( - &fragments[..], - text, - DEFAULT_HIGHLIGHTING_PREFIX, - DEFAULT_HIGHLIGHTING_POSTFIX, - ); + let snippet = select_best_fragment_combination(&fragments[..], text); assert_eq!(snippet.fragment, ""); assert_eq!(snippet.to_html(), ""); assert!(snippet.is_empty()); @@ -788,8 +722,8 @@ Survey in 2016, 2017, and 2018."#; } { snippet_generator.set_max_num_chars(90); - snippet_generator.set_highlighted_elements("", ""); - let snippet = snippet_generator.snippet(TEST_TEXT); + let mut snippet = snippet_generator.snippet(TEST_TEXT); + snippet.set_highlighted_elements("", ""); assert_eq!( snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its \ From 7bc2dd3fc9799a02fb6a51b6d0d30864ffe57873 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com> Date: Sun, 4 Sep 2022 12:10:34 +0400 Subject: [PATCH 09/11] reduce test_snippet_generator_custom_highlighted_elements code --- src/snippet/mod.rs | 55 ++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 61896de146..4f3a7c9c34 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -689,47 +689,20 @@ Survey in 2016, 2017, and 2018."#; } #[test] - fn test_snippet_generator_custom_highlighted_elements() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let text_options = TextOptions::default().set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("en_stem") - .set_index_option(IndexRecordOption::Basic), + fn test_snippet_generator_custom_highlighted_elements() { + let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 }; + let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); + assert_eq!( + snippet.to_html(), + "Rust is a systems programming language sponsored by\nMozilla which \ + describes it as a "safe" + ); + snippet.set_highlighted_elements("", ""); + assert_eq!( + snippet.to_html(), + "Rust is a systems programming language \ + sponsored by\nMozilla which describes it as a "safe" ); - let text_field = schema_builder.add_text_field("text", text_options); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - { - // writing the segment - let mut index_writer = index.writer_for_tests()?; - let doc = doc!(text_field => TEST_TEXT); - index_writer.add_document(doc)?; - index_writer.commit()?; - } - let searcher = index.reader().unwrap().searcher(); - let query_parser = QueryParser::for_index(&index, vec![text_field]); - let query = query_parser.parse_query("rust design").unwrap(); - let mut snippet_generator = - SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); - { - let snippet = snippet_generator.snippet(TEST_TEXT); - assert_eq!( - snippet.to_html(), - "imperative-procedural paradigms. Rust is syntactically similar to \ - C++[according to whom?],\nbut its designers intend it to provide better \ - memory safety" - ); - } - { - snippet_generator.set_max_num_chars(90); - let mut snippet = snippet_generator.snippet(TEST_TEXT); - snippet.set_highlighted_elements("", ""); - assert_eq!( - snippet.to_html(), - "Rust is syntactically similar to C++[according to whom?],\nbut its \ - designers intend it to" - ); - } - Ok(()) } } From 77702022ed0522d38d8bfbabe3be36f64f7a5e29 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrd@users.noreply.github.com> Date: Sat, 6 May 2023 14:09:10 +0400 Subject: [PATCH 10/11] fix fmt --- src/snippet/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 16e686974e..bc96505cbf 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -382,13 +382,12 @@ mod tests { use maplit::btreemap; + use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination}; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::tokenizer::{NgramTokenizer, SimpleTokenizer}; use crate::{Index, SnippetGenerator}; - use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination}; - const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], @@ -703,7 +702,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!( snippet.to_html(), "Rust is a systems programming language \ - sponsored by\nMozilla which describes it as a "safe" + sponsored by\nMozilla which describes it as a "safe" ); } } From 4350878965c5ce914e99795e995096e9be04c5e7 Mon Sep 17 00:00:00 2001 From: Sergei Lavrentev <23312691+lavrd@users.noreply.github.com> Date: Wed, 10 May 2023 15:01:47 +0400 Subject: [PATCH 11/11] change names to more convenient --- src/snippet/mod.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index bc96505cbf..09fd9c8d6f 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -11,8 +11,8 @@ use crate::{Document, Score, Searcher, Term}; const DEFAULT_MAX_NUM_CHARS: usize = 150; -const DEFAULT_HIGHLIGHTING_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHTING_POSTFIX: &str = ""; +const DEFAULT_SNIPPET_PREFIX: &str = ""; +const DEFAULT_SNIPPET_POSTFIX: &str = ""; #[derive(Debug)] pub struct FragmentCandidate { @@ -58,8 +58,8 @@ impl FragmentCandidate { pub struct Snippet { fragment: String, highlighted: Vec>, - highlighting_prefix: String, - highlighting_postfix: String, + snippet_prefix: String, + snippet_postfix: String, } impl Snippet { @@ -68,8 +68,8 @@ impl Snippet { Self { fragment: fragment.to_string(), highlighted, - highlighting_prefix: DEFAULT_HIGHLIGHTING_PREFIX.to_string(), - highlighting_postfix: DEFAULT_HIGHLIGHTING_POSTFIX.to_string(), + snippet_prefix: DEFAULT_SNIPPET_PREFIX.to_string(), + snippet_postfix: DEFAULT_SNIPPET_POSTFIX.to_string(), } } @@ -78,8 +78,8 @@ impl Snippet { Snippet { fragment: String::new(), highlighted: Vec::new(), - highlighting_prefix: String::new(), - highlighting_postfix: String::new(), + snippet_prefix: String::new(), + snippet_postfix: String::new(), } } @@ -95,9 +95,9 @@ impl Snippet { for item in collapse_overlapped_ranges(&self.highlighted) { html.push_str(&encode_minimal(&self.fragment[start_from..item.start])); - html.push_str(&self.highlighting_prefix); + html.push_str(&self.snippet_prefix); html.push_str(&encode_minimal(&self.fragment[item.clone()])); - html.push_str(&self.highlighting_postfix); + html.push_str(&self.snippet_postfix); start_from = item.end; } html.push_str(&encode_minimal( @@ -117,9 +117,9 @@ impl Snippet { } /// Sets highlighted prefix and postfix. - pub fn set_highlighted_elements(&mut self, prefix: &str, postfix: &str) { - self.highlighting_prefix = prefix.to_string(); - self.highlighting_postfix = postfix.to_string() + pub fn set_snippet_prefix_postfix(&mut self, prefix: &str, postfix: &str) { + self.snippet_prefix = prefix.to_string(); + self.snippet_postfix = postfix.to_string() } } @@ -698,7 +698,7 @@ Survey in 2016, 2017, and 2018."#; "Rust is a systems programming language sponsored by\nMozilla which \ describes it as a "safe" ); - snippet.set_highlighted_elements("", ""); + snippet.set_snippet_prefix_postfix("", ""); assert_eq!( snippet.to_html(), "Rust is a systems programming language \