Add rest api test and clarify docs about leniency

quickwit-oss · Dec 9, 2024 · f04c1b7 · f04c1b7
1 parent 2951107
commit f04c1b7
Show file tree

Hide file tree

Showing 11 changed files with 54 additions and 27 deletions.
diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md
@@ -394,6 +394,7 @@ The following query types are supported.
 | `fields`           | `String[]` (Optional) | Default search target fields.                                                                                               | -             |
 | `default_operator` | `"AND"` or `"OR"`     | In the absence of boolean operator defines whether terms should be combined as a conjunction (`AND`) or disjunction (`OR`). | `OR`          |
 | `boost`            | `Number`              | Multiplier boost for score computation.                                                                                     | 1.0           |
+| `lenient`          | `Boolean`             | [See note](#about-the-lenient-argument).                                                                                    | false         |
 
 
 ### `bool`
@@ -494,7 +495,7 @@ The following query types are supported.
 | `operator`         | `"AND"` or `"OR"` | Defines whether all terms should be present (`AND`) or if at least one term is sufficient to match (`OR`).                     | OR      |
 | `zero_terms_query` | `all` or `none`   | Defines if all (`all`) or no documents (`none`) should be returned if the query does not contain any terms after tokenization. | `none`  |
 | `boost`            | `Number`          | Multiplier boost for score computation                                                                                         | 1.0     |
-
+| `lenient`          | `Boolean`         | [See note](#about-the-lenient-argument).                                                                                       | false   |
 
 
 
@@ -637,8 +638,17 @@ Contrary to ES/Opensearch, in Quickwit, at most 50 terms will be considered when
 }
 ```
 
-#### Supported Multi-match Queries
-| Type            | Description                                                                                 |
+#### Supported parameters
+
+| Variable           | Type                  | Description                                  | Default value |
+| ------------------ | --------------------- | -------------------------------------------- | ------------- |
+| `type`             | `String`              | See supported types below                    | `most_fields` |
+| `fields`           | `String[]` (Optional) | Default search target fields.                | -             |
+| `lenient`          | `Boolean`             | [See note](#about-the-lenient-argument).     | false         |
+
+Supported types:
+
+| `type` value    | Description                                                                                 |
 | --------------- | ------------------------------------------------------------------------------------------- |
 | `most_fields`   | Finds documents matching any field and combines the `_score` from each field (default).  |
 | `phrase`        | Runs a `match_phrase` query on each field.       |
@@ -721,6 +731,12 @@ Query matching only documents containing a non-null value for a given field.
 | `field`  | String | Only documents with a value for field will be returned. | -       |
 
 
+### About the `lenient` argument
+
+Quickwit and Elasticsearch have different interpretations of the `lenient` setting:
+- In Quickwit, lenient mode ignores the parts of a query that reference non-existent fields instead of rejecting the whole query. Elasticsearch behaves this way by default.
+- In Elasticsearch, lenient mode primarily suppresses type errors (such as searching for text in an integer field). Quickwit always tolerates such type errors, regardless of the `lenient` setting.
+
 ## Search multiple indices
 
 Search APIs that accept <index_id> requests path parameter also support multi-target syntax.

diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -702,12 +702,14 @@ mod test {
             phrase: "short".to_string(),
             max_expansions: 50,
             params: params.clone(),
+            lenient: false,
         };
         let long = PhrasePrefixQuery {
             field: "title".to_string(),
             phrase: "not so short".to_string(),
             max_expansions: 50,
             params: params.clone(),
+            lenient: false,
         };
         let mut extractor1 = ExtractPrefixTermRanges::with_schema(&schema, &tokenizer_manager);
         extractor1.visit_phrase_prefix(&short).unwrap();

diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs
@@ -19,6 +19,7 @@
 
 use serde::Deserialize;
 
+use super::LeniencyBool;
 use crate::elastic_query_dsl::{
     ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization,
 };
@@ -42,11 +43,8 @@ pub(crate) struct MatchQueryParams {
     pub(crate) operator: BooleanOperand,
     #[serde(default)]
     pub(crate) zero_terms_query: MatchAllOrNone,
-    // Quickwit and Elastic have different notions of lenient. For us, it means it's okay to
-    // disregard part of the query where which uses non-existing collumn (which Elastic does by
-    // default). For Elastic, it covers type errors (searching text in an integer field).
     #[serde(default)]
-    pub(crate) lenient: bool,
+    pub(crate) lenient: LeniencyBool,
 }
 
 impl ConvertibleToQueryAst for MatchQuery {

diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
@@ -50,6 +50,14 @@ use crate::elastic_query_dsl::terms_query::TermsQuery;
 use crate::not_nan_f32::NotNaNf32;
 use crate::query_ast::QueryAst;
 
+/// Quickwit and Elasticsearch have different interpretations of leniency:
+/// - In Quickwit, lenient mode ignores the parts of a query that reference non-existent fields
+///   instead of rejecting the whole query. Elasticsearch behaves this way by default.
+/// - In Elasticsearch, lenient mode primarily suppresses type errors (such as searching for text
+///   in an integer field). Quickwit always tolerates such type errors, regardless of the
+///   `lenient` setting.
+pub type LeniencyBool = bool;
+
 fn default_max_expansions() -> u32 {
     50
 }

diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs b/quickwit/quickwit-query/src/elastic_query_dsl/multi_match.rs
@@ -21,6 +21,7 @@ use serde::Deserialize;
 use serde_with::formats::PreferMany;
 use serde_with::{serde_as, OneOrMany};
 
+use super::LeniencyBool;
 use crate::elastic_query_dsl::bool_query::BoolQuery;
 use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery;
 use crate::elastic_query_dsl::match_phrase_query::{MatchPhraseQuery, MatchPhraseQueryParams};
@@ -48,11 +49,8 @@ struct MultiMatchQueryForDeserialization {
     #[serde_as(deserialize_as = "OneOrMany<_, PreferMany>")]
     #[serde(default)]
     fields: Vec<String>,
-    // Quickwit and Elastic have different notions of lenient. For us, it means it's okay to
-    // disregard part of the query where which uses non-existing collumn (which Elastic does by
-    // default). For Elastic, it covers type errors (searching text in an integer field).
     #[serde(default)]
-    lenient: bool,
+    lenient: LeniencyBool,
 }
 
 fn deserialize_match_query_for_one_field(

diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/query_string_query.rs
@@ -19,6 +19,7 @@
 
 use serde::Deserialize;
 
+use super::LeniencyBool;
 use crate::elastic_query_dsl::ConvertibleToQueryAst;
 use crate::not_nan_f32::NotNaNf32;
 use crate::query_ast::UserInputQuery;
@@ -40,11 +41,8 @@ pub(crate) struct QueryStringQuery {
     default_operator: BooleanOperand,
     #[serde(default)]
     boost: Option<NotNaNf32>,
-    // Regardless of this option Quickwit behaves in elasticsearch definition of
-    // lenient. We include this property here just to accept user queries containing
-    // this option.
     #[serde(default)]
-    lenient: bool,
+    lenient: LeniencyBool,
 }
 
 impl ConvertibleToQueryAst for QueryStringQuery {

diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs
@@ -227,6 +227,7 @@ pub struct FullTextQuery {
     pub field: String,
     pub text: String,
     pub params: FullTextParams,
+    /// If true, parts of the query targeting non-existing fields are ignored instead of failing.
     pub lenient: bool,
 }
 

diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
@@ -38,6 +38,7 @@ pub struct PhrasePrefixQuery {
     pub phrase: String,
     pub max_expansions: u32,
     pub params: FullTextParams,
+    /// If true, parts of the query targeting non-existing fields are ignored instead of failing.
     pub lenient: bool,
 }
 

diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs
@@ -49,6 +49,7 @@ pub struct UserInputQuery {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub default_fields: Option<Vec<String>>,
     pub default_operator: BooleanOperand,
+    /// If true, parts of the query targeting non-existing fields are ignored instead of failing.
     pub lenient: bool,
 }
 

diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs
@@ -34,6 +34,7 @@ use crate::{find_field_or_hit_dynamic, InvalidQuery};
 pub struct WildcardQuery {
     pub field: String,
     pub value: String,
+    /// If true, parts of the query targeting non-existing fields are ignored instead of failing.
     pub lenient: bool,
 }
 
@@ -43,16 +44,6 @@ impl From<WildcardQuery> for QueryAst {
     }
 }
 
-impl WildcardQuery {
-    #[cfg(test)]
-    pub fn from_field_value(field: impl ToString, value: impl ToString) -> Self {
-        Self {
-            field: field.to_string(),
-            value: value.to_string(),
-        }
-    }
-}
-
 fn extract_unique_token(mut tokens: Vec<Term>) -> anyhow::Result<Term> {
     let term = tokens
         .pop()
@@ -218,6 +209,7 @@ mod tests {
         let query = WildcardQuery {
             field: "my_field".to_string(),
             value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(),
+            lenient: false,
         };
         let tokenizer_manager = create_default_quickwit_tokenizer_manager();
         for tokenizer in ["raw", "whitespace"] {

diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
@@ -226,19 +226,31 @@ json:
   query:
     query_string:
       query: "true"
-      fields: ["public", "public.inner"]
+      fields: ["public", "public.notdefined", "notdefined"]
       lenient: true
 expected:
   hits:
     total:
       value: 100
 ---
+# trailing wildcard
+json:
+  query:
+    query_string:
+      query: "jour*"
+      fields: ["payload.description", "payload.notdefined", "notdefined"]
+      lenient: true
+expected:
+  hits:
+    total:
+      value: 3
+---
 # elasticsearch accepts this query
 engines:
   - quickwit
 json:
   query:
     query_string:
       query: "true"
-      fields: ["public", "public.inner"]
+      fields: ["public", "public.notdefined"]
 status_code: 400