From 98d8933bd43b824146db56b0432fec1cc5c65aa0 Mon Sep 17 00:00:00 2001 From: Evance Soumaoro Date: Tue, 5 Jul 2022 17:48:56 +0000 Subject: [PATCH] updated tantivy version to get support for phrase query with slop --- CHANGELOG.md | 1 + Cargo.lock | 12 +++---- docs/reference/query-language.md | 30 ++++++++++++++++ quickwit-core/Cargo.toml | 2 +- quickwit-directories/Cargo.toml | 2 +- quickwit-doc-mapper/Cargo.toml | 4 +-- quickwit-indexing/Cargo.toml | 2 +- quickwit-search/Cargo.toml | 2 +- quickwit-search/src/tests.rs | 62 ++++++++++++++++++++++++++++++++ quickwit-storage/Cargo.toml | 2 +- 10 files changed, 106 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05cab0ae89b..f91a44f8a40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Support for boolean field + - Support for slop in phrase queries ### Fixed diff --git a/Cargo.lock b/Cargo.lock index db58f0bdcf7..c2eda58bc03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -951,7 +951,7 @@ checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04" [[package]] name = "fastfield_codecs" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -2135,7 +2135,7 @@ dependencies = [ [[package]] name = "ownedbytes" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" dependencies = [ "stable_deref_trait", ] @@ -4042,7 +4042,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.18.0" -source = 
"git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" dependencies = [ "async-trait", "base64", @@ -4094,12 +4094,12 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" [[package]] name = "tantivy-common" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" dependencies = [ "byteorder", "ownedbytes", @@ -4119,7 +4119,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.18.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=a568857#a5688572a52b0202bfe83e868566e4e3aabe686f" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=5750224#5750224d4c9d13d863ba7fbe36bae1ddeaeb8038" dependencies = [ "combine", "once_cell", diff --git a/docs/reference/query-language.md b/docs/reference/query-language.md index fa463bc8441..f28df556eae 100644 --- a/docs/reference/query-language.md +++ b/docs/reference/query-language.md @@ -35,6 +35,36 @@ Quickwit supports parenthesis to group multiple clauses: (color:red OR color:green) AND size:large ``` +### Slop Operator + +Quickwit also supports phrase queries with a slop parameter using the slop operator `~` followed by the value of the slop. For instance, the query `body:"small bike"~2` will match documents containing the word `small` followed by the word `bike`, with at most two other words in between. 
+ +:::caution +Slop queries can only be used on fields indexed with the [record option](./../configuration/index-config.md#text-type) set to `position`. +::: + +#### Examples: + +With the following corpus: +```json +[ + {"id": 1, "body": "a red bike"}, + {"id": 2, "body": "a small blue bike"}, + {"id": 3, "body": "a small, rusty, and yellow bike"}, + {"id": 4, "body": "fred's small bike"}, + {"id": 5, "body": "a tiny shelter"} +] +``` +The following queries will output: + +- `body:"small bird"~2`: no match [] +- `body:"red bike"~2`: matches [1] +- `body:"small blue bike"~3`: matches [2] +- `body:"small bike"`: matches [4] +- `body:"small bike"~1`: matches [2, 4] +- `body:"small bike"~2`: matches [2, 4] +- `body:"small bike"~3`: matches [2, 3, 4] + ### Escaping Special Characters Special reserved characters are: `+` , `^`, `` ` ``, `:`, `{`, `}`, `"`, `[`, `]`, `(`, `)`, `~`, `!`, `\\`, `*`, `SPACE`. Such characters can still appear in query terms, but they need to be escaped by an antislash `\` . 
diff --git a/quickwit-core/Cargo.toml b/quickwit-core/Cargo.toml index 4f9d87d5a8c..2b40ef92abe 100644 --- a/quickwit-core/Cargo.toml +++ b/quickwit-core/Cargo.toml @@ -26,7 +26,7 @@ quickwit-storage = { version = "0.3.1", path = "../quickwit-storage" } rand = "0.8" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit-directories/Cargo.toml b/quickwit-directories/Cargo.toml index f0647fa7e61..aca161e7f0c 100644 --- a/quickwit-directories/Cargo.toml +++ b/quickwit-directories/Cargo.toml @@ -19,7 +19,7 @@ quickwit-storage = { version = "0.3.1", path = "../quickwit-storage" } serde = "1" serde_cbor = "0.11" serde_json = "1" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit-doc-mapper/Cargo.toml b/quickwit-doc-mapper/Cargo.toml index 291ec4ace51..05b34531333 100644 --- a/quickwit-doc-mapper/Cargo.toml +++ b/quickwit-doc-mapper/Cargo.toml @@ -19,13 +19,13 @@ once_cell = "1.12" regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", "quickwit" ] } -tantivy-query-grammar = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857" } +tantivy-query-grammar = { git = 
"https://github.com/quickwit-oss/tantivy/", rev = "5750224" } thiserror = "1.0" tracing = "0.1.29" typetag = "0.2" diff --git a/quickwit-indexing/Cargo.toml b/quickwit-indexing/Cargo.toml index 23789f900b9..912a7670baf 100644 --- a/quickwit-indexing/Cargo.toml +++ b/quickwit-indexing/Cargo.toml @@ -49,7 +49,7 @@ rusoto_kinesis = { version = "0.48", default-features = false, features = [ serde = "1" serde_json = "1" serde_yaml = "0.8" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit-search/Cargo.toml b/quickwit-search/Cargo.toml index 9544235a295..4376b9e223f 100644 --- a/quickwit-search/Cargo.toml +++ b/quickwit-search/Cargo.toml @@ -40,7 +40,7 @@ quickwit-storage = { version = "0.3.1", path = "../quickwit-storage" } rayon = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit-search/src/tests.rs b/quickwit-search/src/tests.rs index 3306936c9bf..a2d21081115 100644 --- a/quickwit-search/src/tests.rs +++ b/quickwit-search/src/tests.rs @@ -44,6 +44,7 @@ async fn test_single_node_simple() -> anyhow::Result<()> { let docs = vec![ json!({"title": "snoopy", "body": "Snoopy is an anthropomorphic beagle[5] in the comic strip...", "url": "http://snoopy"}), json!({"title": "beagle", "body": "The beagle is a breed of small scent hound, similar in appearance to the much larger foxhound.", "url": "http://beagle"}), + json!({"title": "hamsters", "body": "A hamsters is a small rodent popularized these days 
as pets.", "url": "http://hamsters"}), ]; test_sandbox.add_documents(docs.clone()).await?; let search_request = SearchRequest { @@ -72,6 +73,67 @@ async fn test_single_node_simple() -> anyhow::Result<()> { Ok(()) } +async fn slop_search_and_check( + test_sandbox: &TestSandbox, + index_id: &str, + query: &str, + expected_num_match: u64, +) -> anyhow::Result<()> { + let search_request = SearchRequest { + index_id: index_id.to_string(), + query: query.to_string(), + search_fields: vec!["body".to_string()], + start_timestamp: None, + end_timestamp: None, + max_hits: 5, + start_offset: 0, + ..Default::default() + }; + let single_node_result = single_node_search( + &search_request, + &*test_sandbox.metastore(), + test_sandbox.storage_uri_resolver(), + ) + .await?; + assert_eq!(single_node_result.num_hits, expected_num_match); + assert_eq!(single_node_result.hits.len(), expected_num_match as usize); + Ok(()) +} + +#[tokio::test] +async fn test_slop_queries() -> anyhow::Result<()> { + let index_id = "slop-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + record: position + "#; + + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + let docs = vec![ + json!({"title": "one", "body": "a red bike"}), + json!({"title": "two", "body": "a small blue bike"}), + json!({"title": "three", "body": "a small, rusty, and yellow bike"}), + json!({"title": "four", "body": "fred's small bike"}), + json!({"title": "five", "body": "a tiny shelter"}), + ]; + test_sandbox.add_documents(docs.clone()).await?; + + slop_search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0).await?; + slop_search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1).await?; + slop_search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1).await?; + slop_search_and_check(&test_sandbox, index_id, "\"small bike\"", 1).await?; + slop_search_and_check(&test_sandbox, index_id, "\"small 
bike\"~1", 2).await?; + slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2).await?; + slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3).await?; + slop_search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1).await?; + + Ok(()) +} + // TODO remove me once `Iterator::is_sorted_by_key` is stabilized. fn is_sorted>(mut it: I) -> bool where E: Ord { diff --git a/quickwit-storage/Cargo.toml b/quickwit-storage/Cargo.toml index b1d9e22fccf..65487bcc114 100644 --- a/quickwit-storage/Cargo.toml +++ b/quickwit-storage/Cargo.toml @@ -33,7 +33,7 @@ rusoto_s3 = { version = "0.48", default-features = false, features = [ ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "a568857", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5750224", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression",