diff --git a/query-grammar/Cargo.toml b/query-grammar/Cargo.toml index b395de9138..e4f2df98b4 100644 --- a/query-grammar/Cargo.toml +++ b/query-grammar/Cargo.toml @@ -14,4 +14,4 @@ edition = "2018" [dependencies] combine = {version="4", default-features=false, features=[] } once_cell = "1.7.2" -regex ={ version = "1.5.4", default-features = false, features = ["std"] } +regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] } diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index dc8d06fbf3..4f05fbbea5 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -16,9 +16,9 @@ use crate::Occur; // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to // special characters. const SPECIAL_CHARS: &[char] = &[ - '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ', + '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ', ]; -const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#; +const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#; /// Parses a field_name /// A field name must have at least one character and be followed by a colon. 
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> { fn term_val<'a>() -> impl Parser<&'a str, Output = String> { let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"')); - phrase.or(word()) + negative_number().or(phrase.or(word())) } fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - let term_val_with_field = negative_number().or(term_val()); - (field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral { + (field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral { field_name: Some(field_name), phrase, + slop, + }) +} + +fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { + let slop = + (char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() { + Ok(d) => Ok(d), + _ => Err(StringStreamError::UnexpectedParse), + }); + optional(slop).map(|slop| match slop { + Some(d) => d, + _ => 0, }) } fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_default_field = term_val().map(|phrase| UserInputLiteral { + let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral { field_name: None, phrase, + slop, }); + attempt(term_query()) .or(term_default_field) .map(UserInputLeaf::from) @@ -522,18 +536,10 @@ mod test { super::field_name().parse(".my.field.name:a"), Ok((".my.field.name".to_string(), "a")) ); - assert_eq!( - super::field_name().parse(r#"my\ field:a"#), - Ok(("my field".to_string(), "a")) - ); assert_eq!( super::field_name().parse(r#"にんじん:a"#), Ok(("にんじん".to_string(), "a")) ); - assert_eq!( - super::field_name().parse("my\\ field\\ name:a"), - Ok(("my field name".to_string(), "a")) - ); assert_eq!( super::field_name().parse(r#"my\field:a"#), Ok((r#"my\field"#.to_string(), "a")) @@ -562,6 +568,17 @@ mod test { super::field_name().parse("_my_field:a"), Ok(("_my_field".to_string(), "a")) ); + assert_eq!( + super::field_name().parse("~my~field:a"), 
Ok(("~my~field".to_string(), "a")) + ); + for special_char in SPECIAL_CHARS.iter() { + let query = &format!("\\{special_char}my\\{special_char}field:a"); + assert_eq!( + super::field_name().parse(&query), + Ok((format!("{special_char}my{special_char}field"), "a")) + ); + } } #[test] @@ -714,4 +731,22 @@ mod test { ); test_is_parse_err("abc + "); } + + #[test] + fn test_slop() { + assert!(parse_to_ast().parse("\"a b\"~").is_err()); + assert!(parse_to_ast().parse("foo:\"a b\"~").is_err()); + assert!(parse_to_ast().parse("\"a b\"~a").is_err()); + assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err()); + + test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")"); + test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\""); + test_parse_query_to_ast_helper("~Document", "\"~Document\""); + test_parse_query_to_ast_helper("a~2", "\"a~2\""); + test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\""); + test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1"); + test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3"); + test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300"); + test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2"); + } } diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index 359900bab1..3130ddbbe6 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -40,14 +40,19 @@ impl Debug for UserInputLeaf { pub struct UserInputLiteral { pub field_name: Option<String>, pub phrase: String, + pub slop: u32, } impl fmt::Debug for UserInputLiteral { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - match self.field_name { - Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase), - None => write!(formatter, "\"{}\"", self.phrase), + if let Some(ref field) = self.field_name { + write!(formatter, "\"{}\":", field)?; } + write!(formatter, "\"{}\"", self.phrase)?; + if self.slop > 0 { 
write!(formatter, "~{}", self.slop)?; + } + Ok(()) } } diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index b147158a40..ee5fadec1e 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -40,7 +40,12 @@ impl PhraseQuery { /// Creates a new `PhraseQuery` given a list of terms and their offsets. /// /// Can be used to provide custom offset for each term. - pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery { + pub fn new_with_offset(terms: Vec<(usize, Term)>) -> PhraseQuery { + PhraseQuery::new_with_offset_and_slop(terms, 0) + } + + /// Creates a new `PhraseQuery` given a list of terms, their offsets and a slop + pub fn new_with_offset_and_slop(mut terms: Vec<(usize, Term)>, slop: u32) -> PhraseQuery { assert!( terms.len() > 1, "A phrase query is required to have strictly more than one term." @@ -54,7 +59,7 @@ impl PhraseQuery { PhraseQuery { field, phrase_terms: terms, - slop: 0, + slop, } } diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index 9d26c3cd67..2eb75e675a 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -8,7 +8,7 @@ use crate::Score; #[derive(Clone)] pub enum LogicalLiteral { Term(Term), - Phrase(Vec<(usize, Term)>), + Phrase(Vec<(usize, Term)>, u32), Range { field: Field, value_type: Type, @@ -74,7 +74,14 @@ impl fmt::Debug for LogicalLiteral { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { match *self { LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term), - LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms), + LogicalLiteral::Phrase(ref terms, slop) => { + write!(formatter, "\"{:?}\"", terms)?; + if slop > 0 { + write!(formatter, "~{:?}", slop) + } else { + Ok(()) + } + } LogicalLiteral::Range { ref lower, ref upper, diff --git a/src/query/query_parser/query_parser.rs 
b/src/query/query_parser/query_parser.rs index 597dca07c4..15494dfd08 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -168,6 +168,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> { /// It is also possible to define a boost for a some specific field, at the query parser level. /// (See [`set_boost(...)`](#method.set_field_boost) ). Typically you may want to boost a title /// field. +/// +/// Phrase terms support the `~` slop operator which allows to set the phrase's matching +/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`. #[derive(Clone)] pub struct QueryParser { schema: Schema, @@ -405,6 +408,7 @@ impl QueryParser { field: Field, json_path: &str, phrase: &str, + slop: u32, ) -> Result<Vec<LogicalLiteral>, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -461,6 +465,7 @@ impl QueryParser { field_name, field, phrase, + slop, &text_analyzer, index_record_option, )? @@ -626,7 +631,9 @@ impl QueryParser { self.compute_path_triplets_for_literal(&literal)?; let mut asts: Vec<LogicalAst> = Vec::new(); for (field, json_path, phrase) in term_phrases { - for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? { + for ast in + self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)? + { // Apply some field specific boost defined at the query parser level. 
let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); } } @@ -670,9 +677,9 @@ impl QueryParser { fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> { match logical_literal { LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)), - LogicalLiteral::Phrase(term_with_offsets) => { - Box::new(PhraseQuery::new_with_offset(term_with_offsets)) - } + LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new( + PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop), + ), LogicalLiteral::Range { field, value_type, @@ -689,6 +696,7 @@ fn generate_literals_for_str( field_name: &str, field: Field, phrase: &str, + slop: u32, text_analyzer: &TextAnalyzer, index_record_option: IndexRecordOption, ) -> Result<Option<LogicalLiteral>, QueryParserError> { @@ -710,7 +718,7 @@ fn generate_literals_for_str( field_name.to_string(), )); } - Ok(Some(LogicalLiteral::Phrase(terms))) + Ok(Some(LogicalLiteral::Phrase(terms, slop))) } fn generate_literals_for_json_object( @@ -741,7 +749,7 @@ fn generate_literals_for_json_object( field_name.to_string(), )); } - logical_literals.push(LogicalLiteral::Phrase(terms, 0)); Ok(logical_literals) } @@ -1493,4 +1501,23 @@ mod test { assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]); assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]); } + + #[test] + pub fn test_phrase_slop() { + test_parse_query_to_logical_ast_helper( + "\"a b\"~0", + r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#, + false, + ); + test_parse_query_to_logical_ast_helper( + "\"a b\"~2", + r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#, + false, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b~4\"~2", + r#""[(0, 
Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#, + false, + ); + } }