From c3ac2bd58a5c406982212def72580cc25e89761a Mon Sep 17 00:00:00 2001 From: Liam Thompson <32779855+leemthompo@users.noreply.github.com> Date: Thu, 28 Nov 2024 08:23:28 +0100 Subject: [PATCH 01/15] [DOCS] Add Elastic Rerank usage docs (#117625) --- .../inference/service-elasticsearch.asciidoc | 41 +++++++-- .../reranking/semantic-reranking.asciidoc | 20 +++-- docs/reference/search/retriever.asciidoc | 83 +++++++++++++++++-- 3 files changed, 121 insertions(+), 23 deletions(-) diff --git a/docs/reference/inference/service-elasticsearch.asciidoc b/docs/reference/inference/service-elasticsearch.asciidoc index 0103b425faefe..cd06e6d7b2f64 100644 --- a/docs/reference/inference/service-elasticsearch.asciidoc +++ b/docs/reference/inference/service-elasticsearch.asciidoc @@ -69,15 +69,15 @@ include::inference-shared.asciidoc[tag=service-settings] These settings are specific to the `elasticsearch` service. -- -`adaptive_allocations`::: -(Optional, object) -include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=adaptive-allocation] - `deployment_id`::: (Optional, string) The `deployment_id` of an existing trained model deployment. When `deployment_id` is used the `model_id` is optional. +`adaptive_allocations`::: +(Optional, object) +include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=adaptive-allocation] + `enabled`:::: (Optional, Boolean) include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=adaptive-allocation-enabled] @@ -119,7 +119,6 @@ include::inference-shared.asciidoc[tag=task-settings] Returns the document instead of only the index. Defaults to `true`. ===== - [discrete] [[inference-example-elasticsearch-elser]] ==== ELSER via the `elasticsearch` service @@ -137,7 +136,7 @@ PUT _inference/sparse_embedding/my-elser-model "adaptive_allocations": { <1> "enabled": true, "min_number_of_allocations": 1, - "max_number_of_allocations": 10 + "max_number_of_allocations": 4 }, "num_threads": 1, "model_id": ".elser_model_2" <2> @@ -150,6 +149,34 @@ PUT _inference/sparse_embedding/my-elser-model Valid values are `.elser_model_2` and `.elser_model_2_linux-x86_64`. For further details, refer to the {ml-docs}/ml-nlp-elser.html[ELSER model documentation]. +[discrete] +[[inference-example-elastic-reranker]] +==== Elastic Rerank via the `elasticsearch` service + +The following example shows how to create an {infer} endpoint called `my-elastic-rerank` to perform a `rerank` task type using the built-in Elastic Rerank cross-encoder model. + +The API request below will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. +Once deployed, the model can be used for semantic re-ranking with a <>. + +[source,console] +------------------------------------------------------------ +PUT _inference/rerank/my-elastic-rerank +{ + "service": "elasticsearch", + "service_settings": { + "model_id": ".rerank-v1", <1> + "num_threads": 1, + "adaptive_allocations": { <2> + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 4 + } + } +} +------------------------------------------------------------ +// TEST[skip:TBD] +<1> The `model_id` must be the ID of the built-in Elastic Rerank model: `.rerank-v1`. +<2> {ml-docs}/ml-nlp-auto-scale.html#nlp-model-adaptive-allocations[Adaptive allocations] will be enabled with the minimum of 1 and the maximum of 4 allocations.
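+
+Once the endpoint is created, you can sanity-check it by calling it directly with the {infer} API before wiring it into a retriever.
+The request below is a minimal illustration: the `query` and `input` strings are placeholder values, and the relevance scores in the response will depend on the model and your data.
+
+[source,console]
+------------------------------------------------------------
+POST _inference/rerank/my-elastic-rerank
+{
+  "query": "How often does the moon hide the sun?",
+  "input": [
+    "A solar eclipse happens when the moon passes between the sun and the earth.",
+    "The moon is the earth's only natural satellite."
+  ]
+}
+------------------------------------------------------------
+// TEST[skip:TBD]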
[discrete] [[inference-example-elasticsearch]] @@ -186,7 +213,7 @@ If using the Python client, you can set the `timeout` parameter to a higher valu [discrete] [[inference-example-eland]] -==== Models uploaded by Eland via the elasticsearch service +==== Models uploaded by Eland via the `elasticsearch` service The following example shows how to create an {infer} endpoint called `my-msmarco-minilm-model` to perform a `text_embedding` task type. diff --git a/docs/reference/reranking/semantic-reranking.asciidoc b/docs/reference/reranking/semantic-reranking.asciidoc index 4ebe90e44708e..e1e2abd224a8e 100644 --- a/docs/reference/reranking/semantic-reranking.asciidoc +++ b/docs/reference/reranking/semantic-reranking.asciidoc @@ -85,14 +85,16 @@ In {es}, semantic re-rankers are implemented using the {es} <> using the `rerank` task type -** Integrate directly with the <> using the `rerank` task type -** Upload a model to {es} from Hugging Face with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland]. You'll need to use the `text_similarity` NLP task type when loading the model using Eland. Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third party text similarity models supported by {es} for semantic re-ranking. -*** Then set up an <> with the `rerank` task type -. *Create a `rerank` task using the <>*. +. *Select and configure a re-ranking model*. +You have the following options: +.. Use the <> cross-encoder model via the inference API's {es} service. +.. Use the <> to create a `rerank` endpoint. +.. Use the <> to create a `rerank` endpoint. +.. Upload a model to {es} from Hugging Face with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland]. You'll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an <> with the `rerank` task type. ++ +Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third-party text similarity models supported by {es} for semantic re-ranking. + +. *Create a `rerank` endpoint using the <>*. The Inference API creates an inference endpoint and configures your chosen machine learning model to perform the re-ranking task. . *Define a `text_similarity_reranker` retriever in your search request*. The retriever syntax makes it simple to configure both the retrieval and re-ranking of search results in a single API call. @@ -117,7 +119,7 @@ POST _search } }, "field": "text", - "inference_id": "my-cohere-rerank-model", + "inference_id": "elastic-rerank", "inference_text": "How often does the moon hide the sun?", "rank_window_size": 100, "min_score": 0.5 diff --git a/docs/reference/search/retriever.asciidoc b/docs/reference/search/retriever.asciidoc index 86a81f1d155d2..b90b7e312c790 100644 --- a/docs/reference/search/retriever.asciidoc +++ b/docs/reference/search/retriever.asciidoc @@ -11,6 +11,7 @@ This allows for complex behavior to be depicted in a tree-like structure, called [TIP] ==== Refer to <> for a high level overview of the retrievers abstraction. +Refer to <> for additional examples. ==== The following retrievers are available: @@ -382,16 +383,17 @@ Refer to <> for a high level overview of semantic re-ranking ===== Prerequisites -To use `text_similarity_reranker` you must first set up a `rerank` task using the <>. -The `rerank` task should be set up with a machine learning model that can compute text similarity.
+To use `text_similarity_reranker` you must first set up an inference endpoint for the `rerank` task using the <>. +The endpoint should be set up with a machine learning model that can compute text similarity. Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third-party text similarity models supported by {es}. -Currently you can: +You have the following options: -* Integrate directly with the <> using the `rerank` task type -* Integrate directly with the <> using the `rerank` task type +* Use the built-in <> cross-encoder model via the inference API's {es} service. +* Use the <> with the `rerank` task type. +* Use the <> with the `rerank` task type. * Upload a model to {es} with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland] using the `text_similarity` NLP task type. -** Then set up an <> with the `rerank` task type +** Then set up an <> with the `rerank` task type. ** Refer to the <> on this page for a step-by-step guide. ===== Parameters @@ -436,13 +438,70 @@ Note that score calculations vary depending on the model used. Applies the specified <> to the child <>. If the child retriever already specifies any filters, then this top-level filter is applied in conjunction with the filter defined in the child retriever. +[discrete] +[[text-similarity-reranker-retriever-example-elastic-rerank]] +==== Example: Elastic Rerank + +This example demonstrates how to deploy the Elastic Rerank model and use it to re-rank search results using the `text_similarity_reranker` retriever. + +Follow these steps: + +. Create an inference endpoint for the `rerank` task using the <>. ++ +[source,console] +---- +PUT _inference/rerank/my-elastic-rerank +{ + "service": "elasticsearch", + "service_settings": { + "model_id": ".rerank-v1", + "num_threads": 1, + "adaptive_allocations": { <1> + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 10 + } + } +} +---- +// TEST[skip:uses ML] +<1> {ml-docs}/ml-nlp-auto-scale.html#nlp-model-adaptive-allocations[Adaptive allocations] will be enabled with the minimum of 1 and the maximum of 10 allocations. ++ +. Define a `text_similarity_reranker` retriever: ++ +[source,console] +---- +POST _search +{ + "retriever": { + "text_similarity_reranker": { + "retriever": { + "standard": { + "query": { + "match": { + "text": "How often does the moon hide the sun?" + } + } + } + }, + "field": "text", + "inference_id": "my-elastic-rerank", + "inference_text": "How often does the moon hide the sun?", + "rank_window_size": 100, + "min_score": 0.5 + } + } +} +---- +// TEST[skip:uses ML] + [discrete] [[text-similarity-reranker-retriever-example-cohere]] ==== Example: Cohere Rerank This example enables out-of-the-box semantic search by re-ranking top documents using the Cohere Rerank API. This approach eliminates the need to generate and store embeddings for all indexed documents. -This requires a <> using the `rerank` task type. +This requires a <> that is set up for the `rerank` task type. [source,console] ---- @@ -680,6 +739,12 @@ GET movies/_search <1> The `rule` retriever is the outermost retriever, applying rules to the search results that were previously reranked using the `rrf` retriever. <2> The `rrf` retriever returns results from all of its sub-retrievers, and the output of the `rrf` retriever is used as input to the `rule` retriever.
+[discrete] +[[retriever-common-parameters]] +=== Common usage guidelines + +[discrete] +[[retriever-size-pagination]] ==== Using `from` and `size` with a retriever tree The <> and <> @@ -688,12 +753,16 @@ parameters are provided globally as part of the general They are applied to all retrievers in a retriever tree, unless a specific retriever overrides the `size` parameter using a different parameter such as `rank_window_size`. Though, the final search hits are always limited to `size`. +[discrete] +[[retriever-aggregations]] ==== Using aggregations with a retriever tree <> are globally specified as part of a search request. The query used for an aggregation is the combination of all leaf retrievers as `should` clauses in a <>. +[discrete] +[[retriever-restrictions]] ==== Restrictions on search parameters when specifying a retriever When a retriever is specified as part of a search, the following elements are not allowed at the top-level. From 79d70686b3ba86dcab4694d46e5a81de74ba06f8 Mon Sep 17 00:00:00 2001 From: kosabogi <105062005+kosabogi@users.noreply.github.com> Date: Thu, 28 Nov 2024 09:26:16 +0100 Subject: [PATCH 02/15] Fixes typo (#117684) --- .../ml/trained-models/apis/get-trained-models-stats.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/ml/trained-models/apis/get-trained-models-stats.asciidoc b/docs/reference/ml/trained-models/apis/get-trained-models-stats.asciidoc index beff87e6ec6e6..b55f022a5d168 100644 --- a/docs/reference/ml/trained-models/apis/get-trained-models-stats.asciidoc +++ b/docs/reference/ml/trained-models/apis/get-trained-models-stats.asciidoc @@ -235,7 +235,7 @@ The reason for the current state. Usually only populated when the `routing_state (string) The current routing state. -- -* `starting`: The model is attempting to allocate on this model, inference calls are not yet accepted. +* `starting`: The model is attempting to allocate on this node, inference calls are not yet accepted. * `started`: The model is allocated and ready to accept inference requests. * `stopping`: The model is being deallocated from this node. * `stopped`: The model is fully deallocated from this node. From dc7ea9eff9a5897fabc2fb9dd3bb291eee77ca11 Mon Sep 17 00:00:00 2001 From: Alexander Spies Date: Thu, 28 Nov 2024 09:40:38 +0100 Subject: [PATCH 03/15] ESQL: Fix LookupJoin output (#117639) * Fix output methods related to LookupJoin * Add tests with subsequent EVAL * Fix BinaryPlan.computeReferences This must not just use the references from its own output. Not only is this wrong, it also leads to failures when we call the .references() method on unresolved plans. 
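
For illustration, the removed override in `BinaryPlan` derived its references from the plan's own output, roughly:

```java
// Removed: deriving references from output() assumes the plan is resolved.
// On an unresolved plan, output() may throw or return the wrong attributes,
// so calling references() during analysis could fail.
protected AttributeSet computeReferences() {
    return Expressions.references(output());
}
```

Dropping the override falls back to `QueryPlan.computeReferences`, which derives references from the plan's expressions and therefore does not require a resolved output.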
--- .../xpack/esql/ccq/MultiClusterSpecIT.java | 4 +- .../src/main/resources/lookup-join.csv-spec | 67 +++++++++++++++---- .../xpack/esql/action/EsqlCapabilities.java | 2 +- .../xpack/esql/analysis/Analyzer.java | 15 ++--- .../xpack/esql/plan/QueryPlan.java | 5 ++ .../xpack/esql/plan/logical/BinaryPlan.java | 7 -- .../xpack/esql/plan/logical/join/Join.java | 48 ++++--------- .../esql/plan/logical/join/LookupJoin.java | 43 +++--------- .../xpack/esql/session/EsqlSession.java | 4 -- .../elasticsearch/xpack/esql/CsvTests.java | 2 +- .../xpack/esql/analysis/AnalyzerTests.java | 5 +- 11 files changed, 91 insertions(+), 111 deletions(-) diff --git a/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java b/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java index 5df85d1004dd1..8f4522573f880 100644 --- a/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java +++ b/x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java @@ -47,7 +47,7 @@ import static org.elasticsearch.xpack.esql.EsqlTestUtils.classpathResources; import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.INLINESTATS; import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.INLINESTATS_V2; -import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.JOIN_LOOKUP; +import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.JOIN_LOOKUP_V2; import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.JOIN_PLANNING_V1; import static org.elasticsearch.xpack.esql.action.EsqlCapabilities.Cap.METADATA_FIELDS_REMOTE_TEST; import static org.elasticsearch.xpack.esql.qa.rest.EsqlSpecTestCase.Mode.SYNC; @@ -125,7 +125,7 @@ protected void shouldSkipTest(String testName) throws IOException { assumeFalse("INLINESTATS not yet supported in CCS", testCase.requiredCapabilities.contains(INLINESTATS.capabilityName())); assumeFalse("INLINESTATS not yet supported in CCS", testCase.requiredCapabilities.contains(INLINESTATS_V2.capabilityName())); assumeFalse("INLINESTATS not yet supported in CCS", testCase.requiredCapabilities.contains(JOIN_PLANNING_V1.capabilityName())); - assumeFalse("LOOKUP JOIN not yet supported in CCS", testCase.requiredCapabilities.contains(JOIN_LOOKUP.capabilityName())); + assumeFalse("LOOKUP JOIN not yet supported in CCS", testCase.requiredCapabilities.contains(JOIN_LOOKUP_V2.capabilityName())); } private TestFeatureService remoteFeaturesService() throws IOException { diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec index 605bf78c20a32..11786fb905c60 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec @@ -3,22 +3,22 @@ // Reuses the sample dataset and commands from enrich.csv-spec // -basicOnTheDataNode -required_capability: join_lookup +//TODO: this sometimes returns null instead of the looked up value (likely related to the execution order) +basicOnTheDataNode-Ignore +required_capability: join_lookup_v2 -//TODO: this returns different results in CI then locally -// sometimes null, sometimes spanish (likely related to the execution order) FROM employees | EVAL language_code = 
languages | LOOKUP JOIN languages_lookup ON language_code -| WHERE emp_no < 500 -| KEEP emp_no, language_name +| WHERE emp_no >= 10091 AND emp_no < 10094 | SORT emp_no -| LIMIT 1 +| KEEP emp_no, language_code, language_name ; -emp_no:integer | language_name:keyword -//10091 | Spanish +emp_no:integer | language_code:integer | language_name:keyword +10091 | 3 | Spanish +10092 | 1 | English +10093 | 3 | Spanish ; basicRow-Ignore @@ -33,16 +33,55 @@ language_code:keyword | language_name:keyword ; basicOnTheCoordinator -required_capability: join_lookup +required_capability: join_lookup_v2 + +FROM employees +| SORT emp_no +| LIMIT 3 +| EVAL language_code = languages +| LOOKUP JOIN languages_lookup ON language_code +| KEEP emp_no, language_code, language_name +; + +emp_no:integer | language_code:integer | language_name:keyword +10001 | 2 | French +10002 | 5 | null +10003 | 4 | German +; + +//TODO: this sometimes returns null instead of the looked up value (likely related to the execution order) +subsequentEvalOnTheDataNode-Ignore +required_capability: join_lookup_v2 + +FROM employees +| EVAL language_code = languages +| LOOKUP JOIN languages_lookup ON language_code +| WHERE emp_no >= 10091 AND emp_no < 10094 +| SORT emp_no +| KEEP emp_no, language_code, language_name +| EVAL language_name = TO_LOWER(language_name), language_code_x2 = 2*language_code +; + +emp_no:integer | language_code:integer | language_name:keyword | language_code_x2:integer +10091 | 3 | spanish | 6 +10092 | 1 | english | 2 +10093 | 3 | spanish | 6 +; + +subsequentEvalOnTheCoordinator +required_capability: join_lookup_v2 FROM employees | SORT emp_no -| LIMIT 1 +| LIMIT 3 | EVAL language_code = languages | LOOKUP JOIN languages_lookup ON language_code -| KEEP emp_no, language_name +| KEEP emp_no, language_code, language_name +| EVAL language_name = TO_LOWER(language_name), language_code_x2 = 2*language_code ; -emp_no:integer | language_name:keyword -10001 | French +emp_no:integer | language_code:integer | language_name:keyword | language_code_x2:integer +10001 | 2 | french | 4 +10002 | 5 | null | 10 +10003 | 4 | german | 8 ; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 58748781d1778..d8004f73f613f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -524,7 +524,7 @@ public enum Cap { /** * LOOKUP JOIN */ - JOIN_LOOKUP(Build.current().isSnapshot()), + JOIN_LOOKUP_V2(Build.current().isSnapshot()), /** * Fix for https://github.com/elastic/elasticsearch/issues/117054 diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java index dde7bc09ac615..b847508d2b161 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Analyzer.java @@ -21,7 +21,6 @@ import org.elasticsearch.xpack.esql.core.capabilities.Resolvables; import org.elasticsearch.xpack.esql.core.expression.Alias; import org.elasticsearch.xpack.esql.core.expression.Attribute; -import org.elasticsearch.xpack.esql.core.expression.AttributeSet; import org.elasticsearch.xpack.esql.core.expression.EmptyAttribute; import 
org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Expressions; @@ -609,8 +608,7 @@ private Join resolveLookupJoin(LookupJoin join) { JoinConfig config = join.config(); // for now, support only (LEFT) USING clauses JoinType type = config.type(); - // rewrite the join into a equi-join between the field with the same name between left and right - // per SQL standard, the USING columns are placed first in the output, followed by the rest of left, then right + // rewrite the join into an equi-join between the field with the same name between left and right if (type instanceof UsingJoinType using) { List cols = using.columns(); // the lookup cannot be resolved, bail out @@ -632,14 +630,9 @@ private Join resolveLookupJoin(LookupJoin join) { // resolve the using columns against the left and the right side then assemble the new join config List leftKeys = resolveUsingColumns(cols, join.left().output(), "left"); List rightKeys = resolveUsingColumns(cols, join.right().output(), "right"); - List output = new ArrayList<>(join.left().output()); - // the order is stable (since the AttributeSet preservers the insertion order) - output.addAll(join.right().outputSet().subtract(new AttributeSet(rightKeys))); - - // update the config - pick the left keys as those in the output - type = new UsingJoinType(coreJoin, rightKeys); - config = new JoinConfig(type, leftKeys, leftKeys, rightKeys); - join = new LookupJoin(join.source(), join.left(), join.right(), config, output); + + config = new JoinConfig(coreJoin, leftKeys, leftKeys, rightKeys); + join = new LookupJoin(join.source(), join.left(), join.right(), config); } // everything else is unsupported for now else { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/QueryPlan.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/QueryPlan.java index ef8c3983faf2e..02373cc62e81f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/QueryPlan.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/QueryPlan.java @@ -33,6 +33,10 @@ public QueryPlan(Source source, List children) { super(source, children); } + /** + * The ordered list of attributes (i.e. columns) this plan produces when executed. + * Must be called only on resolved plans, otherwise may throw an exception or return wrong results. + */ public abstract List output(); public AttributeSet outputSet() { @@ -87,6 +91,7 @@ public AttributeSet references() { /** * This very likely needs to be overridden for {@link QueryPlan#references} to be correct when inheriting. + * This can be called on unresolved plans and therefore must not rely on calls to {@link QueryPlan#output()}. 
*/ protected AttributeSet computeReferences() { return Expressions.references(expressions()); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/BinaryPlan.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/BinaryPlan.java index e65cdda4b6069..91cd7f7a15840 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/BinaryPlan.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/BinaryPlan.java @@ -6,8 +6,6 @@ */ package org.elasticsearch.xpack.esql.plan.logical; -import org.elasticsearch.xpack.esql.core.expression.AttributeSet; -import org.elasticsearch.xpack.esql.core.expression.Expressions; import org.elasticsearch.xpack.esql.core.tree.Source; import java.util.Arrays; @@ -45,11 +43,6 @@ public final BinaryPlan replaceRight(LogicalPlan newRight) { return replaceChildren(left, newRight); } - protected AttributeSet computeReferences() { - // TODO: this needs to be driven by the join config - return Expressions.references(output()); - } - public abstract BinaryPlan replaceChildren(LogicalPlan left, LogicalPlan right); @Override diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/Join.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/Join.java index 0e182646d914a..dd6b3ea3455f7 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/Join.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/Join.java @@ -10,9 +10,8 @@ import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.Maps; import org.elasticsearch.xpack.esql.core.expression.Attribute; -import org.elasticsearch.xpack.esql.core.expression.Nullability; +import org.elasticsearch.xpack.esql.core.expression.NamedExpression; import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; @@ -23,9 +22,11 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import static org.elasticsearch.xpack.esql.expression.NamedExpressions.mergeOutputAttributes; import static org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes.LEFT; import static org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes.RIGHT; @@ -107,37 +108,24 @@ public static List computeOutput(List leftOutput, List output; // TODO: make the other side nullable + Set matchFieldNames = config.matchFields().stream().map(NamedExpression::name).collect(Collectors.toSet()); if (LEFT.equals(joinType)) { - // right side becomes nullable and overrides left - // output = merge(leftOutput, makeNullable(rightOutput)); - output = merge(leftOutput, rightOutput); + // right side becomes nullable and overrides left except for match fields, which we preserve from the left + List rightOutputWithoutMatchFields = rightOutput.stream() + .filter(attr -> matchFieldNames.contains(attr.name()) == false) + .toList(); + output = mergeOutputAttributes(rightOutputWithoutMatchFields, leftOutput); } else if (RIGHT.equals(joinType)) { - // left side becomes nullable and overrides right - // output = merge(makeNullable(leftOutput), 
rightOutput); - output = merge(leftOutput, rightOutput); + List leftOutputWithoutMatchFields = leftOutput.stream() + .filter(attr -> matchFieldNames.contains(attr.name()) == false) + .toList(); + output = mergeOutputAttributes(leftOutputWithoutMatchFields, rightOutput); } else { throw new IllegalArgumentException(joinType.joinName() + " unsupported"); } return output; } - /** - * Merge the two lists of attributes into one and preserves order. - */ - private static List merge(List left, List right) { - // use linked hash map to preserve order - Map nameToAttribute = Maps.newLinkedHashMapWithExpectedSize(left.size() + right.size()); - for (Attribute a : left) { - nameToAttribute.put(a.name(), a); - } - for (Attribute a : right) { - // override the existing entry in place - nameToAttribute.compute(a.name(), (name, existing) -> a); - } - - return new ArrayList<>(nameToAttribute.values()); - } - /** * Make fields references, so we don't check if they exist in the index. * We do this for fields that we know don't come from the index. @@ -161,14 +149,6 @@ public static List makeReference(List output) { return out; } - private static List makeNullable(List output) { - List out = new ArrayList<>(output.size()); - for (Attribute a : output) { - out.add(a.withNullability(Nullability.TRUE)); - } - return out; - } - @Override public boolean expressionsResolved() { return config.expressionsResolved(); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/LookupJoin.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/LookupJoin.java index 2ee9213f45b36..57c8cb00baa32 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/LookupJoin.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/logical/join/LookupJoin.java @@ -16,7 +16,6 @@ import org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes.UsingJoinType; import java.util.List; -import java.util.Objects; import static java.util.Collections.emptyList; import static org.elasticsearch.xpack.esql.plan.logical.join.JoinTypes.LEFT; @@ -26,10 +25,8 @@ */ public class LookupJoin extends Join implements SurrogateLogicalPlan { - private final List output; - public LookupJoin(Source source, LogicalPlan left, LogicalPlan right, List joinFields) { - this(source, left, right, new UsingJoinType(LEFT, joinFields), emptyList(), emptyList(), emptyList(), emptyList()); + this(source, left, right, new UsingJoinType(LEFT, joinFields), emptyList(), emptyList(), emptyList()); } public LookupJoin( @@ -39,15 +36,13 @@ public LookupJoin( JoinType type, List joinFields, List leftFields, - List rightFields, - List output + List rightFields ) { - this(source, left, right, new JoinConfig(type, joinFields, leftFields, rightFields), output); + this(source, left, right, new JoinConfig(type, joinFields, leftFields, rightFields)); } - public LookupJoin(Source source, LogicalPlan left, LogicalPlan right, JoinConfig joinConfig, List output) { + public LookupJoin(Source source, LogicalPlan left, LogicalPlan right, JoinConfig joinConfig) { super(source, left, right, joinConfig); - this.output = output; } /** @@ -55,20 +50,14 @@ public LookupJoin(Source source, LogicalPlan left, LogicalPlan right, JoinConfig */ @Override public LogicalPlan surrogate() { - JoinConfig cfg = config(); - JoinConfig newConfig = new JoinConfig(LEFT, cfg.matchFields(), cfg.leftFields(), cfg.rightFields()); - Join normalized = new Join(source(), left(), right(), newConfig); + Join 
normalized = new Join(source(), left(), right(), config()); // TODO: decide whether to introduce USING or just basic ON semantics - keep the ordering out for now - return new Project(source(), normalized, output); - } - - public List output() { - return output; + return new Project(source(), normalized, output()); } @Override public Join replaceChildren(LogicalPlan left, LogicalPlan right) { - return new LookupJoin(source(), left, right, config(), output); + return new LookupJoin(source(), left, right, config()); } @Override @@ -81,23 +70,7 @@ protected NodeInfo info() { config().type(), config().matchFields(), config().leftFields(), - config().rightFields(), - output + config().rightFields() ); } - - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), output); - } - - @Override - public boolean equals(Object obj) { - if (super.equals(obj) == false) { - return false; - } - - LookupJoin other = (LookupJoin) obj; - return Objects.equals(output, other.output); - } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java index 021596c31f65d..3b0f9ab578df9 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java @@ -79,7 +79,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.function.Predicate; import java.util.stream.Collectors; import static org.elasticsearch.index.query.QueryBuilders.boolQuery; @@ -466,8 +465,6 @@ static Set fieldNames(LogicalPlan parsed, Set enrichPolicyMatchF // ie "from test | eval lang = languages + 1 | keep *l" should consider both "languages" and "*l" as valid fields to ask for AttributeSet keepCommandReferences = new AttributeSet(); AttributeSet keepJoinReferences = new AttributeSet(); - List> keepMatches = new ArrayList<>(); - List keepPatterns = new ArrayList<>(); parsed.forEachDown(p -> {// go over each plan top-down if (p instanceof RegexExtract re) { // for Grok and Dissect @@ -501,7 +498,6 @@ static Set fieldNames(LogicalPlan parsed, Set enrichPolicyMatchF references.add(ua); if (p instanceof Keep) { keepCommandReferences.add(ua); - keepMatches.add(up::match); } }); if (p instanceof Keep) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java index c745801bf505f..6763988eac638 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java @@ -263,7 +263,7 @@ public final void test() throws Throwable { ); assumeFalse( "lookup join disabled for csv tests", - testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.JOIN_LOOKUP.capabilityName()) + testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.JOIN_LOOKUP_V2.capabilityName()) ); if (Build.current().isSnapshot()) { assertThat( diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/AnalyzerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/AnalyzerTests.java index 2770ed1f336ae..e0ebc92afa95d 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/AnalyzerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/AnalyzerTests.java @@ 
-1945,9 +1945,10 @@ public void testLookup() { .item(startsWith("job{f}")) .item(startsWith("job.raw{f}")) /* - * Int key is returned as a full field (despite the rename) + * Int is a reference here because we renamed it in project. + * If we hadn't it'd be a field and that'd be fine. */ - .item(containsString("int{f}")) + .item(containsString("int{r}")) .item(startsWith("last_name{f}")) .item(startsWith("long_noidx{f}")) .item(startsWith("salary{f}")) From 11ffe8831793a5cad91b5bb5fb63e2365286451a Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 28 Nov 2024 09:54:42 +0100 Subject: [PATCH 04/15] Speedup HealthNodeTaskExecutor CS listener (#113436) This method was quite slow in tests because there's an expensive assertion in `ClusterApplierService.state()` that we run when calling `ClusterService.localNode()` --- .../selection/HealthNodeTaskExecutor.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/health/node/selection/HealthNodeTaskExecutor.java b/server/src/main/java/org/elasticsearch/health/node/selection/HealthNodeTaskExecutor.java index 3efad1aee26b0..5991bc248ba76 100644 --- a/server/src/main/java/org/elasticsearch/health/node/selection/HealthNodeTaskExecutor.java +++ b/server/src/main/java/org/elasticsearch/health/node/selection/HealthNodeTaskExecutor.java @@ -182,8 +182,8 @@ void startTask(ClusterChangedEvent event) { // visible for testing void shuttingDown(ClusterChangedEvent event) { - DiscoveryNode node = clusterService.localNode(); - if (isNodeShuttingDown(event, node.getId())) { + if (isNodeShuttingDown(event)) { + var node = event.state().getNodes().getLocalNode(); abortTaskIfApplicable("node [{" + node.getName() + "}{" + node.getId() + "}] shutting down"); } } @@ -198,9 +198,18 @@ void abortTaskIfApplicable(String reason) { } } - private static boolean isNodeShuttingDown(ClusterChangedEvent event, String nodeId) { - return event.previousState().metadata().nodeShutdowns().contains(nodeId) == false - && event.state().metadata().nodeShutdowns().contains(nodeId); + private static boolean isNodeShuttingDown(ClusterChangedEvent event) { + if (event.metadataChanged() == false) { + return false; + } + var shutdownsOld = event.previousState().metadata().nodeShutdowns(); + var shutdownsNew = event.state().metadata().nodeShutdowns(); + if (shutdownsNew == shutdownsOld) { + return false; + } + String nodeId = event.state().nodes().getLocalNodeId(); + return shutdownsOld.contains(nodeId) == false && shutdownsNew.contains(nodeId); + } public static List getNamedXContentParsers() { From d4bcd979a5b9196f23b00d97cb17aad1679818c8 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 28 Nov 2024 10:05:26 +0100 Subject: [PATCH 05/15] Update synthetic source legacy license cutoff date. (#117658) Update default cutoff date from 12-12-2024T00:00 UTC to 01-02-2025T00:00 UTC. 
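
For clarity, those dates are day-month-year: the default moves from December 12th, 2024 to February 1st, 2025. The new cutoff is an epoch-millis constant, as in this simplified excerpt of the change below:

```java
// 2025-02-01T00:00 UTC (February 1st, 2025), expressed as epoch milliseconds
static final long DEFAULT_CUTOFF_DATE = LocalDateTime.of(2025, 2, 1, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli();
```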
--- .../xpack/logsdb/SyntheticSourceLicenseService.java | 2 +- .../SyntheticSourceIndexSettingsProviderLegacyLicenseTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/SyntheticSourceLicenseService.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/SyntheticSourceLicenseService.java index 71de2f7909835..26a672fb1c903 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/SyntheticSourceLicenseService.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/SyntheticSourceLicenseService.java @@ -29,7 +29,7 @@ final class SyntheticSourceLicenseService { // You can only override this property if you received explicit approval from Elastic. static final String CUTOFF_DATE_SYS_PROP_NAME = "es.mapping.synthetic_source_fallback_to_stored_source.cutoff_date_restricted_override"; private static final Logger LOGGER = LogManager.getLogger(SyntheticSourceLicenseService.class); - static final long DEFAULT_CUTOFF_DATE = LocalDateTime.of(2024, 12, 12, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli(); + static final long DEFAULT_CUTOFF_DATE = LocalDateTime.of(2025, 2, 1, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli(); /** * A setting that determines whether source mode should always be stored source. Regardless of licence. diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/SyntheticSourceIndexSettingsProviderLegacyLicenseTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/SyntheticSourceIndexSettingsProviderLegacyLicenseTests.java index 939d7d892a48d..eda0d87868745 100644 --- a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/SyntheticSourceIndexSettingsProviderLegacyLicenseTests.java +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/SyntheticSourceIndexSettingsProviderLegacyLicenseTests.java @@ -98,7 +98,7 @@ public void testGetAdditionalIndexSettingsTsdb() throws IOException { } public void testGetAdditionalIndexSettingsTsdbAfterCutoffDate() throws Exception { - long start = LocalDateTime.of(2024, 12, 20, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli(); + long start = LocalDateTime.of(2025, 2, 2, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli(); License license = createGoldOrPlatinumLicense(start); long time = LocalDateTime.of(2024, 12, 31, 0, 0).toInstant(ZoneOffset.UTC).toEpochMilli(); var licenseState = new XPackLicenseState(() -> time, new XPackLicenseStatus(license.operationMode(), true, null)); From 5d686973084e926a2dbec96a311a6684807f5406 Mon Sep 17 00:00:00 2001 From: David Kyle Date: Thu, 28 Nov 2024 09:36:59 +0000 Subject: [PATCH 06/15] [ML] Delete accidental changelog for a non issue (#117636) --- docs/changelog/117235.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/changelog/117235.yaml diff --git a/docs/changelog/117235.yaml b/docs/changelog/117235.yaml deleted file mode 100644 index dbf0b4cc18388..0000000000000 --- a/docs/changelog/117235.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pr: 117235 -summary: "Deprecate `ChunkingOptions` parameter" -area: ES|QL -type: enhancement -issues: [] From 6a4b68d263fe3533fc44e90d779537b48ffaf5f6 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 28 Nov 2024 10:53:39 +0100 Subject: [PATCH 07/15] Add source mode stats to MappingStats (#117463) --- docs/reference/cluster/stats.asciidoc | 5 +- .../test/cluster.stats/40_source_modes.yml | 50 ++++++++++ server/src/main/java/module-info.java 
| 3 +- .../org/elasticsearch/TransportVersions.java | 3 + .../cluster/stats/ClusterStatsFeatures.java | 26 ++++++ .../admin/cluster/stats/MappingStats.java | 55 ++++++++++- ...lasticsearch.features.FeatureSpecification | 1 + .../cluster/stats/MappingStatsTests.java | 92 ++++++++++++++++++- .../ClusterStatsMonitoringDocTests.java | 3 +- 9 files changed, 226 insertions(+), 12 deletions(-) create mode 100644 rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/40_source_modes.yml create mode 100644 server/src/main/java/org/elasticsearch/action/admin/cluster/stats/ClusterStatsFeatures.java diff --git a/docs/reference/cluster/stats.asciidoc b/docs/reference/cluster/stats.asciidoc index bd818a538f78b..d875417bde51a 100644 --- a/docs/reference/cluster/stats.asciidoc +++ b/docs/reference/cluster/stats.asciidoc @@ -1644,7 +1644,10 @@ The API returns the following response: "total_deduplicated_mapping_size": "0b", "total_deduplicated_mapping_size_in_bytes": 0, "field_types": [], - "runtime_field_types": [] + "runtime_field_types": [], + "source_modes" : { + "stored": 0 + } }, "analysis": { "char_filter_types": [], diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/40_source_modes.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/40_source_modes.yml new file mode 100644 index 0000000000000..64bbad7fb1c6d --- /dev/null +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/40_source_modes.yml @@ -0,0 +1,50 @@ +--- +test source modes: + - requires: + cluster_features: ["cluster.stats.source_modes"] + reason: requires source modes features + + - do: + indices.create: + index: test-synthetic + body: + settings: + index: + mapping: + source.mode: synthetic + + - do: + indices.create: + index: test-stored + + - do: + indices.create: + index: test-disabled + body: + settings: + index: + mapping: + source.mode: disabled + + - do: + bulk: + refresh: true + body: + - '{ "create": { "_index": "test-synthetic" } }' + - '{ "name": "aaaa", "some_string": "AaAa", "some_int": 1000, "some_double": 123.456789, "some_bool": true }' + - '{ "create": { "_index": "test-stored" } }' + - '{ "name": "bbbb", "some_string": "BbBb", "some_int": 2000, "some_double": 321.987654, "some_bool": false }' + - '{ "create": { "_index": "test-disabled" } }' + - '{ "name": "cccc", "some_string": "CcCc", "some_int": 3000, "some_double": 421.484654, "some_bool": false }' + + - do: + search: + index: test-* + - match: { hits.total.value: 3 } + + - do: + cluster.stats: { } + + - match: { indices.mappings.source_modes.disabled: 1 } + - match: { indices.mappings.source_modes.stored: 1 } + - match: { indices.mappings.source_modes.synthetic: 1 } diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 35d1a44624b0f..63dbac3a72487 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -433,7 +433,8 @@ org.elasticsearch.search.SearchFeatures, org.elasticsearch.script.ScriptFeatures, org.elasticsearch.search.retriever.RetrieversFeatures, - org.elasticsearch.reservedstate.service.FileSettingsFeatures; + org.elasticsearch.reservedstate.service.FileSettingsFeatures, + org.elasticsearch.action.admin.cluster.stats.ClusterStatsFeatures; uses org.elasticsearch.plugins.internal.SettingsExtension; uses RestExtension; diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index 
dda7d7e5d4c4c..a1315ccf66701 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -205,10 +205,13 @@ static TransportVersion def(int id) { public static final TransportVersion ESQL_ENRICH_RUNTIME_WARNINGS = def(8_796_00_0); public static final TransportVersion INGEST_PIPELINE_CONFIGURATION_AS_MAP = def(8_797_00_0); public static final TransportVersion LOGSDB_TELEMETRY_CUSTOM_CUTOFF_DATE_FIX_8_17 = def(8_797_00_1); + public static final TransportVersion SOURCE_MODE_TELEMETRY_FIX_8_17 = def(8_797_00_2); public static final TransportVersion INDEXING_PRESSURE_THROTTLING_STATS = def(8_798_00_0); public static final TransportVersion REINDEX_DATA_STREAMS = def(8_799_00_0); public static final TransportVersion ESQL_REMOVE_NODE_LEVEL_PLAN = def(8_800_00_0); public static final TransportVersion LOGSDB_TELEMETRY_CUSTOM_CUTOFF_DATE = def(8_801_00_0); + public static final TransportVersion SOURCE_MODE_TELEMETRY = def(8_802_00_0); + /* * STOP! READ THIS FIRST! No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/ClusterStatsFeatures.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/ClusterStatsFeatures.java new file mode 100644 index 0000000000000..6e85093a52cdd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/ClusterStatsFeatures.java @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.action.admin.cluster.stats; + +import org.elasticsearch.features.FeatureSpecification; +import org.elasticsearch.features.NodeFeature; + +import java.util.Set; + +/** + * Spec for cluster stats features. 
+ */ +public class ClusterStatsFeatures implements FeatureSpecification { + + @Override + public Set getFeatures() { + return Set.of(MappingStats.SOURCE_MODES_FEATURE); + } +} diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/MappingStats.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/MappingStats.java index d2e5973169919..1bc2e1d13c864 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/MappingStats.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/stats/MappingStats.java @@ -9,6 +9,7 @@ package org.elasticsearch.action.admin.cluster.stats; +import org.elasticsearch.TransportVersion; import org.elasticsearch.TransportVersions; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.MappingMetadata; @@ -19,6 +20,8 @@ import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.core.Nullable; +import org.elasticsearch.features.NodeFeature; +import org.elasticsearch.index.mapper.SourceFieldMapper; import org.elasticsearch.xcontent.ToXContentFragment; import org.elasticsearch.xcontent.XContentBuilder; @@ -31,6 +34,7 @@ import java.util.HashSet; import java.util.IdentityHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.OptionalLong; @@ -44,6 +48,8 @@ */ public final class MappingStats implements ToXContentFragment, Writeable { + static final NodeFeature SOURCE_MODES_FEATURE = new NodeFeature("cluster.stats.source_modes"); + private static final Pattern DOC_PATTERN = Pattern.compile("doc[\\[.]"); private static final Pattern SOURCE_PATTERN = Pattern.compile("params\\._source"); @@ -53,6 +59,8 @@ public final class MappingStats implements ToXContentFragment, Writeable { public static MappingStats of(Metadata metadata, Runnable ensureNotCancelled) { Map fieldTypes = new HashMap<>(); Set concreteFieldNames = new HashSet<>(); + // Account different source modes based on index.mapping.source.mode setting: + Map sourceModeUsageCount = new HashMap<>(); Map runtimeFieldTypes = new HashMap<>(); final Map mappingCounts = new IdentityHashMap<>(metadata.getMappingsByHash().size()); for (IndexMetadata indexMetadata : metadata) { @@ -62,6 +70,9 @@ public static MappingStats of(Metadata metadata, Runnable ensureNotCancelled) { continue; } AnalysisStats.countMapping(mappingCounts, indexMetadata); + + var sourceMode = SourceFieldMapper.INDEX_MAPPER_SOURCE_MODE_SETTING.get(indexMetadata.getSettings()); + sourceModeUsageCount.merge(sourceMode.toString().toLowerCase(Locale.ENGLISH), 1, Integer::sum); } final AtomicLong totalFieldCount = new AtomicLong(); final AtomicLong totalDeduplicatedFieldCount = new AtomicLong(); @@ -175,12 +186,14 @@ public static MappingStats of(Metadata metadata, Runnable ensureNotCancelled) { for (MappingMetadata mappingMetadata : metadata.getMappingsByHash().values()) { totalMappingSizeBytes += mappingMetadata.source().compressed().length; } + return new MappingStats( totalFieldCount.get(), totalDeduplicatedFieldCount.get(), totalMappingSizeBytes, fieldTypes.values(), - runtimeFieldTypes.values() + runtimeFieldTypes.values(), + sourceModeUsageCount ); } @@ -215,17 +228,20 @@ private static int countOccurrences(String script, Pattern pattern) { private final List fieldTypeStats; private final List runtimeFieldStats; + private final Map sourceModeUsageCount; MappingStats( long totalFieldCount, long 
totalDeduplicatedFieldCount, long totalMappingSizeBytes, Collection fieldTypeStats, - Collection runtimeFieldStats + Collection runtimeFieldStats, + Map sourceModeUsageCount ) { this.totalFieldCount = totalFieldCount; this.totalDeduplicatedFieldCount = totalDeduplicatedFieldCount; this.totalMappingSizeBytes = totalMappingSizeBytes; + this.sourceModeUsageCount = sourceModeUsageCount; List stats = new ArrayList<>(fieldTypeStats); stats.sort(Comparator.comparing(IndexFeatureStats::getName)); this.fieldTypeStats = Collections.unmodifiableList(stats); @@ -246,6 +262,10 @@ private static int countOccurrences(String script, Pattern pattern) { } fieldTypeStats = in.readCollectionAsImmutableList(FieldStats::new); runtimeFieldStats = in.readCollectionAsImmutableList(RuntimeFieldStats::new); + var transportVersion = in.getTransportVersion(); + sourceModeUsageCount = canReadOrWriteSourceModeTelemetry(transportVersion) + ? in.readImmutableMap(StreamInput::readString, StreamInput::readVInt) + : Map.of(); } @Override @@ -257,6 +277,15 @@ public void writeTo(StreamOutput out) throws IOException { } out.writeCollection(fieldTypeStats); out.writeCollection(runtimeFieldStats); + var transportVersion = out.getTransportVersion(); + if (canReadOrWriteSourceModeTelemetry(transportVersion)) { + out.writeMap(sourceModeUsageCount, StreamOutput::writeVInt); + } + } + + private static boolean canReadOrWriteSourceModeTelemetry(TransportVersion version) { + return version.isPatchFrom(TransportVersions.SOURCE_MODE_TELEMETRY_FIX_8_17) + || version.onOrAfter(TransportVersions.SOURCE_MODE_TELEMETRY); } private static OptionalLong ofNullable(Long l) { @@ -300,6 +329,10 @@ public List getRuntimeFieldStats() { return runtimeFieldStats; } + public Map getSourceModeUsageCount() { + return sourceModeUsageCount; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject("mappings"); @@ -326,6 +359,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws st.toXContent(builder, params); } builder.endArray(); + builder.startObject("source_modes"); + var entries = sourceModeUsageCount.entrySet().stream().sorted(Map.Entry.comparingByKey()).toList(); + for (var entry : entries) { + builder.field(entry.getKey(), entry.getValue()); + } + builder.endObject(); builder.endObject(); return builder; } @@ -344,11 +383,19 @@ public boolean equals(Object o) { && Objects.equals(totalDeduplicatedFieldCount, that.totalDeduplicatedFieldCount) && Objects.equals(totalMappingSizeBytes, that.totalMappingSizeBytes) && fieldTypeStats.equals(that.fieldTypeStats) - && runtimeFieldStats.equals(that.runtimeFieldStats); + && runtimeFieldStats.equals(that.runtimeFieldStats) + && sourceModeUsageCount.equals(that.sourceModeUsageCount); } @Override public int hashCode() { - return Objects.hash(totalFieldCount, totalDeduplicatedFieldCount, totalMappingSizeBytes, fieldTypeStats, runtimeFieldStats); + return Objects.hash( + totalFieldCount, + totalDeduplicatedFieldCount, + totalMappingSizeBytes, + fieldTypeStats, + runtimeFieldStats, + sourceModeUsageCount + ); } } diff --git a/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification b/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification index 3955fc87bf392..12965152f260c 100644 --- a/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification +++ 
b/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification @@ -23,3 +23,4 @@ org.elasticsearch.search.retriever.RetrieversFeatures org.elasticsearch.script.ScriptFeatures org.elasticsearch.reservedstate.service.FileSettingsFeatures org.elasticsearch.cluster.routing.RoutingFeatures +org.elasticsearch.action.admin.cluster.stats.ClusterStatsFeatures diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/stats/MappingStatsTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/stats/MappingStatsTests.java index 2c374c7d26dee..96954458c18c4 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/stats/MappingStatsTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/stats/MappingStatsTests.java @@ -18,6 +18,7 @@ import org.elasticsearch.common.io.stream.Writeable.Reader; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.SourceFieldMapper; import org.elasticsearch.script.Script; import org.elasticsearch.tasks.TaskCancelledException; import org.elasticsearch.test.AbstractWireSerializingTestCase; @@ -29,7 +30,15 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Locale; +import java.util.Map; + +import static org.elasticsearch.index.mapper.SourceFieldMapper.Mode.DISABLED; +import static org.elasticsearch.index.mapper.SourceFieldMapper.Mode.STORED; +import static org.elasticsearch.index.mapper.SourceFieldMapper.Mode.SYNTHETIC; +import static org.hamcrest.Matchers.equalTo; public class MappingStatsTests extends AbstractWireSerializingTestCase { @@ -203,7 +212,10 @@ public void testToXContent() { "doc_max" : 0, "doc_total" : 0 } - ] + ], + "source_modes" : { + "stored" : 2 + } } }""", Strings.toString(mappingStats, true, true)); } @@ -332,7 +344,10 @@ public void testToXContentWithSomeSharedMappings() { "doc_max" : 0, "doc_total" : 0 } - ] + ], + "source_modes" : { + "stored" : 3 + } } }""", Strings.toString(mappingStats, true, true)); } @@ -362,7 +377,24 @@ protected MappingStats createTestInstance() { if (randomBoolean()) { runtimeFieldStats.add(randomRuntimeFieldStats("long")); } - return new MappingStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong(), stats, runtimeFieldStats); + Map sourceModeUsageCount = randomBoolean() + ? 
Map.of() + : Map.of( + STORED.toString().toLowerCase(Locale.ENGLISH), + randomNonNegativeInt(), + SYNTHETIC.toString().toLowerCase(Locale.ENGLISH), + randomNonNegativeInt(), + DISABLED.toString().toLowerCase(Locale.ENGLISH), + randomNonNegativeInt() + ); + return new MappingStats( + randomNonNegativeLong(), + randomNonNegativeLong(), + randomNonNegativeLong(), + stats, + runtimeFieldStats, + sourceModeUsageCount + ); } private static FieldStats randomFieldStats(String type) { @@ -410,7 +442,8 @@ protected MappingStats mutateInstance(MappingStats instance) { long totalFieldCount = instance.getTotalFieldCount().getAsLong(); long totalDeduplicatedFieldCount = instance.getTotalDeduplicatedFieldCount().getAsLong(); long totalMappingSizeBytes = instance.getTotalMappingSizeBytes().getAsLong(); - switch (between(1, 5)) { + var sourceModeUsageCount = new HashMap<>(instance.getSourceModeUsageCount()); + switch (between(1, 6)) { case 1 -> { boolean remove = fieldTypes.size() > 0 && randomBoolean(); if (remove) { @@ -435,8 +468,22 @@ protected MappingStats mutateInstance(MappingStats instance) { case 3 -> totalFieldCount = randomValueOtherThan(totalFieldCount, ESTestCase::randomNonNegativeLong); case 4 -> totalDeduplicatedFieldCount = randomValueOtherThan(totalDeduplicatedFieldCount, ESTestCase::randomNonNegativeLong); case 5 -> totalMappingSizeBytes = randomValueOtherThan(totalMappingSizeBytes, ESTestCase::randomNonNegativeLong); + case 6 -> { + if (sourceModeUsageCount.isEmpty() == false) { + sourceModeUsageCount.remove(sourceModeUsageCount.keySet().stream().findFirst().get()); + } else { + sourceModeUsageCount.put("stored", randomNonNegativeInt()); + } + } } - return new MappingStats(totalFieldCount, totalDeduplicatedFieldCount, totalMappingSizeBytes, fieldTypes, runtimeFieldTypes); + return new MappingStats( + totalFieldCount, + totalDeduplicatedFieldCount, + totalMappingSizeBytes, + fieldTypes, + runtimeFieldTypes, + sourceModeUsageCount + ); } public void testDenseVectorType() { @@ -531,4 +578,39 @@ public void testWriteTo() throws IOException { assertEquals(instance.getFieldTypeStats(), deserialized.getFieldTypeStats()); assertEquals(instance.getRuntimeFieldStats(), deserialized.getRuntimeFieldStats()); } + + public void testSourceModes() { + var builder = Metadata.builder(); + int numStoredIndices = randomIntBetween(1, 5); + int numSyntheticIndices = randomIntBetween(1, 5); + int numDisabledIndices = randomIntBetween(1, 5); + for (int i = 0; i < numSyntheticIndices; i++) { + IndexMetadata.Builder indexMetadata = new IndexMetadata.Builder("foo-synthetic-" + i).settings( + indexSettings(IndexVersion.current(), 4, 1).put(SourceFieldMapper.INDEX_MAPPER_SOURCE_MODE_SETTING.getKey(), "synthetic") + ); + builder.put(indexMetadata); + } + for (int i = 0; i < numStoredIndices; i++) { + IndexMetadata.Builder indexMetadata; + if (randomBoolean()) { + indexMetadata = new IndexMetadata.Builder("foo-stored-" + i).settings( + indexSettings(IndexVersion.current(), 4, 1).put(SourceFieldMapper.INDEX_MAPPER_SOURCE_MODE_SETTING.getKey(), "stored") + ); + } else { + indexMetadata = new IndexMetadata.Builder("foo-stored-" + i).settings(indexSettings(IndexVersion.current(), 4, 1)); + } + builder.put(indexMetadata); + } + for (int i = 0; i < numDisabledIndices; i++) { + IndexMetadata.Builder indexMetadata = new IndexMetadata.Builder("foo-disabled-" + i).settings( + indexSettings(IndexVersion.current(), 4, 1).put(SourceFieldMapper.INDEX_MAPPER_SOURCE_MODE_SETTING.getKey(), "disabled") + ); + 
builder.put(indexMetadata); + } + var mappingStats = MappingStats.of(builder.build(), () -> {}); + assertThat(mappingStats.getSourceModeUsageCount().get("synthetic"), equalTo(numSyntheticIndices)); + assertThat(mappingStats.getSourceModeUsageCount().get("stored"), equalTo(numStoredIndices)); + assertThat(mappingStats.getSourceModeUsageCount().get("disabled"), equalTo(numDisabledIndices)); + } + } diff --git a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsMonitoringDocTests.java b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsMonitoringDocTests.java index 9458442557694..f4d50df4ff613 100644 --- a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsMonitoringDocTests.java +++ b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/cluster/ClusterStatsMonitoringDocTests.java @@ -572,7 +572,8 @@ public void testToXContent() throws IOException { "total_deduplicated_field_count": 0, "total_deduplicated_mapping_size_in_bytes": 0, "field_types": [], - "runtime_field_types": [] + "runtime_field_types": [], + "source_modes": {} }, "analysis": { "char_filter_types": [], From 64dfed4e1f0610014f01fc7285fccac831a62c74 Mon Sep 17 00:00:00 2001 From: Alexander Spies Date: Thu, 28 Nov 2024 11:01:52 +0100 Subject: [PATCH 08/15] ESQL: Mute CATEGORIZE optimizer tests on release builds (#117690) --- .../xpack/esql/optimizer/LogicalPlanOptimizerTests.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java index 2b4fb6ad68972..8373528531902 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java @@ -20,6 +20,7 @@ import org.elasticsearch.xpack.esql.EsqlTestUtils; import org.elasticsearch.xpack.esql.TestBlockFactory; import org.elasticsearch.xpack.esql.VerificationException; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; import org.elasticsearch.xpack.esql.analysis.Analyzer; import org.elasticsearch.xpack.esql.analysis.AnalyzerContext; import org.elasticsearch.xpack.esql.analysis.AnalyzerTestUtils; @@ -1211,6 +1212,8 @@ public void testCombineProjectionWithAggregationFirstAndAliasedGroupingUsedInAgg * \_EsRelation[test][_meta_field{f}#23, emp_no{f}#17, first_name{f}#18, ..] */ public void testCombineProjectionWithCategorizeGrouping() { + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + var plan = plan(""" from test | eval k = first_name, k1 = k @@ -3946,6 +3949,8 @@ public void testNestedExpressionsInGroups() { * \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..] 
*/ public void testNestedExpressionsInGroupsWithCategorize() { + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + var plan = optimizedPlan(""" from test | stats c = count(salary) by CATEGORIZE(CONCAT(first_name, "abc")) From 146cb39143f93b6ce453229abf5be08335a75366 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Thu, 28 Nov 2024 13:46:24 +0100 Subject: [PATCH 09/15] ESQL - enabling scoring with METADATA _score (#113120) * ESQL - enabling scoring with METADATA _score Co-authored-by: ChrisHegarty --- docs/changelog/113120.yaml | 5 + muted-tests.yml | 6 + .../search/sort/SortBuilder.java | 15 +- .../core/expression/MetadataAttribute.java | 5 +- .../compute/lucene/LuceneOperator.java | 5 +- .../compute/lucene/LuceneSourceOperator.java | 96 ++++-- .../lucene/LuceneTopNSourceOperator.java | 141 +++++++-- .../elasticsearch/compute/OperatorTests.java | 3 +- .../LuceneQueryExpressionEvaluatorTests.java | 33 +- .../lucene/LuceneSourceOperatorTests.java | 31 +- .../LuceneTopNSourceOperatorScoringTests.java | 151 +++++++++ .../lucene/LuceneTopNSourceOperatorTests.java | 50 ++- .../ValueSourceReaderTypeConversionTests.java | 9 +- .../ValuesSourceReaderOperatorTests.java | 9 +- .../src/main/resources/qstr-function.csv-spec | 1 - .../src/main/resources/scoring.csv-spec | 285 +++++++++++++++++ .../xpack/esql/action/EsqlActionTaskIT.java | 7 +- .../xpack/esql/action/LookupFromIndexIT.java | 3 +- .../xpack/esql/plugin/MatchFunctionIT.java | 299 ++++++++++++++++++ .../xpack/esql/plugin/MatchOperatorIT.java | 51 +++ .../xpack/esql/plugin/QueryStringIT.java | 96 ++++++ .../xpack/esql/action/EsqlCapabilities.java | 7 +- .../xpack/esql/analysis/Verifier.java | 9 + .../local/LucenePushdownPredicates.java | 5 + .../physical/local/PushTopNToSource.java | 18 +- .../local/ReplaceSourceAttributes.java | 14 +- .../xpack/esql/parser/LogicalPlanBuilder.java | 4 +- .../xpack/esql/plan/physical/EsQueryExec.java | 14 + .../planner/EsPhysicalOperationProviders.java | 14 +- .../xpack/esql/analysis/VerifierTests.java | 25 ++ .../optimizer/PhysicalPlanOptimizerTests.java | 62 ++++ .../physical/local/PushTopNToSourceTests.java | 193 ++++++++++- 32 files changed, 1570 insertions(+), 96 deletions(-) create mode 100644 docs/changelog/113120.yaml create mode 100644 x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorScoringTests.java create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/scoring.csv-spec create mode 100644 x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchFunctionIT.java diff --git a/docs/changelog/113120.yaml b/docs/changelog/113120.yaml new file mode 100644 index 0000000000000..801167d61c19c --- /dev/null +++ b/docs/changelog/113120.yaml @@ -0,0 +1,5 @@ +pr: 113120 +summary: ESQL - enabling scoring with METADATA `_score` +area: ES|QL +type: enhancement +issues: [] diff --git a/muted-tests.yml b/muted-tests.yml index 5cf16fdf3da0a..fdadc747289bb 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -224,6 +224,12 @@ tests: issue: https://github.com/elastic/elasticsearch/issues/117591 - class: org.elasticsearch.repositories.s3.RepositoryS3ClientYamlTestSuiteIT issue: https://github.com/elastic/elasticsearch/issues/117596 +- class: "org.elasticsearch.xpack.esql.qa.multi_node.EsqlSpecIT" + method: "test {scoring.*}" + issue: https://github.com/elastic/elasticsearch/issues/117641 +- class: 
"org.elasticsearch.xpack.esql.qa.single_node.EsqlSpecIT" + method: "test {scoring.*}" + issue: https://github.com/elastic/elasticsearch/issues/117641 # Examples: # diff --git a/server/src/main/java/org/elasticsearch/search/sort/SortBuilder.java b/server/src/main/java/org/elasticsearch/search/sort/SortBuilder.java index 0ac3b42dd5b10..5832b93b9462f 100644 --- a/server/src/main/java/org/elasticsearch/search/sort/SortBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/sort/SortBuilder.java @@ -158,6 +158,11 @@ private static void parseCompoundSortField(XContentParser parser, List buildSort(List> sortBuilders, SearchExecutionContext context) throws IOException { + return buildSort(sortBuilders, context, true); + } + + public static Optional buildSort(List> sortBuilders, SearchExecutionContext context, boolean optimize) + throws IOException { List sortFields = new ArrayList<>(sortBuilders.size()); List sortFormats = new ArrayList<>(sortBuilders.size()); for (SortBuilder builder : sortBuilders) { @@ -172,9 +177,13 @@ public static Optional buildSort(List> sortBuilde if (sortFields.size() > 1) { sort = true; } else { - SortField sortField = sortFields.get(0); - if (sortField.getType() == SortField.Type.SCORE && sortField.getReverse() == false) { - sort = false; + if (optimize) { + SortField sortField = sortFields.get(0); + if (sortField.getType() == SortField.Type.SCORE && sortField.getReverse() == false) { + sort = false; + } else { + sort = true; + } } else { sort = true; } diff --git a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/MetadataAttribute.java b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/MetadataAttribute.java index 6e4e9292bfc99..0f1cfbb85039c 100644 --- a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/MetadataAttribute.java +++ b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/MetadataAttribute.java @@ -31,6 +31,7 @@ public class MetadataAttribute extends TypedAttribute { public static final String TIMESTAMP_FIELD = "@timestamp"; public static final String TSID_FIELD = "_tsid"; + public static final String SCORE = "_score"; static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry( Attribute.class, @@ -50,7 +51,9 @@ public class MetadataAttribute extends TypedAttribute { SourceFieldMapper.NAME, tuple(DataType.SOURCE, false), IndexModeFieldMapper.NAME, - tuple(DataType.KEYWORD, true) + tuple(DataType.KEYWORD, true), + SCORE, + tuple(DataType.DOUBLE, false) ); private final boolean searchable; diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java index 6f75298e95dd7..bbc3ace3716ba 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java @@ -79,6 +79,7 @@ public abstract static class Factory implements SourceOperator.SourceOperatorFac protected final DataPartitioning dataPartitioning; protected final int taskConcurrency; protected final int limit; + protected final ScoreMode scoreMode; protected final LuceneSliceQueue sliceQueue; /** @@ -95,6 +96,7 @@ protected Factory( ScoreMode scoreMode ) { this.limit = limit; + this.scoreMode = scoreMode; this.dataPartitioning = dataPartitioning; var 
weightFunction = weightFunction(queryFunction, scoreMode); this.sliceQueue = LuceneSliceQueue.create(contexts, weightFunction, dataPartitioning, taskConcurrency); @@ -438,7 +440,8 @@ static Function weightFunction(Function 0) { - --remainingDocs; - docsBuilder.appendInt(doc); - currentPagePos++; - } else { - throw new CollectionTerminatedException(); - } + class LimitingCollector implements LeafCollector { + @Override + public void setScorer(Scorable scorer) {} + + @Override + public void collect(int doc) throws IOException { + if (remainingDocs > 0) { + --remainingDocs; + docsBuilder.appendInt(doc); + currentPagePos++; + } else { + throw new CollectionTerminatedException(); } - }; + } + } + + final class ScoringCollector extends LuceneSourceOperator.LimitingCollector { + private Scorable scorable; + + @Override + public void setScorer(Scorable scorer) { + this.scorable = scorer; + } + + @Override + public void collect(int doc) throws IOException { + super.collect(doc); + scoreBuilder.appendDouble(scorable.score()); + } } @Override @@ -139,15 +179,27 @@ public Page getCheckedOutput() throws IOException { IntBlock shard = null; IntBlock leaf = null; IntVector docs = null; + DoubleVector scores = null; + DocBlock docBlock = null; try { shard = blockFactory.newConstantIntBlockWith(scorer.shardContext().index(), currentPagePos); leaf = blockFactory.newConstantIntBlockWith(scorer.leafReaderContext().ord, currentPagePos); docs = docsBuilder.build(); docsBuilder = blockFactory.newIntVectorBuilder(Math.min(remainingDocs, maxPageSize)); - page = new Page(currentPagePos, new DocVector(shard.asVector(), leaf.asVector(), docs, true).asBlock()); + docBlock = new DocVector(shard.asVector(), leaf.asVector(), docs, true).asBlock(); + shard = null; + leaf = null; + docs = null; + if (scoreBuilder == null) { + page = new Page(currentPagePos, docBlock); + } else { + scores = scoreBuilder.build(); + scoreBuilder = blockFactory.newDoubleVectorBuilder(Math.min(remainingDocs, maxPageSize)); + page = new Page(currentPagePos, docBlock, scores.asBlock()); + } } finally { if (page == null) { - Releasables.closeExpectNoException(shard, leaf, docs); + Releasables.closeExpectNoException(shard, leaf, docs, docBlock, scores); } } currentPagePos = 0; @@ -160,7 +212,7 @@ public Page getCheckedOutput() throws IOException { @Override public void close() { - docsBuilder.close(); + Releasables.close(docsBuilder, scoreBuilder); } @Override diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperator.java index 0f600958b93b3..8da62963ffb64 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperator.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperator.java @@ -10,15 +10,22 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocsCollector; import 
org.apache.lucene.search.TopFieldCollectorManager;
+import org.apache.lucene.search.TopScoreDocCollectorManager;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.compute.data.BlockFactory;
+import org.elasticsearch.compute.data.DocBlock;
 import org.elasticsearch.compute.data.DocVector;
+import org.elasticsearch.compute.data.DoubleBlock;
+import org.elasticsearch.compute.data.DoubleVector;
 import org.elasticsearch.compute.data.IntBlock;
 import org.elasticsearch.compute.data.IntVector;
 import org.elasticsearch.compute.data.Page;
@@ -29,17 +36,21 @@
 import org.elasticsearch.search.sort.SortBuilder;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 
+import static org.apache.lucene.search.ScoreMode.COMPLETE;
+import static org.apache.lucene.search.ScoreMode.TOP_DOCS;
+
 /**
  * Source operator that builds Pages out of the output of a TopFieldCollector (aka TopN)
  */
 public final class LuceneTopNSourceOperator extends LuceneOperator {
-    public static final class Factory extends LuceneOperator.Factory {
+    public static class Factory extends LuceneOperator.Factory {
         private final int maxPageSize;
         private final List<SortBuilder<?>> sorts;
 
@@ -50,16 +61,17 @@ public Factory(
             int taskConcurrency,
             int maxPageSize,
             int limit,
-            List<SortBuilder<?>> sorts
+            List<SortBuilder<?>> sorts,
+            boolean scoring
         ) {
-            super(contexts, queryFunction, dataPartitioning, taskConcurrency, limit, ScoreMode.TOP_DOCS);
+            super(contexts, queryFunction, dataPartitioning, taskConcurrency, limit, scoring ? COMPLETE : TOP_DOCS);
             this.maxPageSize = maxPageSize;
             this.sorts = sorts;
         }
 
         @Override
         public SourceOperator get(DriverContext driverContext) {
-            return new LuceneTopNSourceOperator(driverContext.blockFactory(), maxPageSize, sorts, limit, sliceQueue);
+            return new LuceneTopNSourceOperator(driverContext.blockFactory(), maxPageSize, sorts, limit, sliceQueue, scoreMode);
         }
 
         public int maxPageSize() {
@@ -75,6 +87,8 @@ public String describe() {
                 + maxPageSize
                 + ", limit = "
                 + limit
+                + ", scoreMode = "
+                + scoreMode
                 + ", sorts = ["
                 + notPrettySorts
                 + "]]";
@@ -93,17 +107,20 @@ public String describe() {
     private PerShardCollector perShardCollector;
     private final List<SortBuilder<?>> sorts;
     private final int limit;
+    private final ScoreMode scoreMode;
 
     public LuceneTopNSourceOperator(
         BlockFactory blockFactory,
         int maxPageSize,
         List<SortBuilder<?>> sorts,
         int limit,
-        LuceneSliceQueue sliceQueue
+        LuceneSliceQueue sliceQueue,
+        ScoreMode scoreMode
     ) {
         super(blockFactory, maxPageSize, sliceQueue);
         this.sorts = sorts;
         this.limit = limit;
+        this.scoreMode = scoreMode;
     }
 
     @Override
@@ -145,7 +162,7 @@ private Page collect() throws IOException {
         try {
             if (perShardCollector == null || perShardCollector.shardContext.index() != scorer.shardContext().index()) {
                 // TODO: share the bottom between shardCollectors
-                perShardCollector = new PerShardCollector(scorer.shardContext(), sorts, limit);
+                perShardCollector = newPerShardCollector(scorer.shardContext(), sorts, limit);
             }
             var leafCollector = perShardCollector.getLeafCollector(scorer.leafReaderContext());
             scorer.scoreNextRange(leafCollector, scorer.leafReaderContext().reader().getLiveDocs(), maxPageSize);
@@ -171,7 +188,7 @@ private Page emit(boolean startEmitting) {
         assert isEmitting() == false : "offset=" + offset + " score_docs=" + Arrays.toString(scoreDocs);
         offset = 0;
         if (perShardCollector != null) {
-            scoreDocs = perShardCollector.topFieldCollector.topDocs().scoreDocs;
+            scoreDocs = 
perShardCollector.collector.topDocs().scoreDocs; } else { scoreDocs = new ScoreDoc[0]; } @@ -183,10 +200,13 @@ private Page emit(boolean startEmitting) { IntBlock shard = null; IntVector segments = null; IntVector docs = null; + DocBlock docBlock = null; + DoubleBlock scores = null; Page page = null; try ( IntVector.Builder currentSegmentBuilder = blockFactory.newIntVectorFixedBuilder(size); - IntVector.Builder currentDocsBuilder = blockFactory.newIntVectorFixedBuilder(size) + IntVector.Builder currentDocsBuilder = blockFactory.newIntVectorFixedBuilder(size); + DoubleVector.Builder currentScoresBuilder = scoreVectorOrNull(size); ) { int start = offset; offset += size; @@ -196,53 +216,130 @@ private Page emit(boolean startEmitting) { int segment = ReaderUtil.subIndex(doc, leafContexts); currentSegmentBuilder.appendInt(segment); currentDocsBuilder.appendInt(doc - leafContexts.get(segment).docBase); // the offset inside the segment + if (currentScoresBuilder != null) { + float score = getScore(scoreDocs[i]); + currentScoresBuilder.appendDouble(score); + } } shard = blockFactory.newConstantIntBlockWith(perShardCollector.shardContext.index(), size); segments = currentSegmentBuilder.build(); docs = currentDocsBuilder.build(); - page = new Page(size, new DocVector(shard.asVector(), segments, docs, null).asBlock()); + docBlock = new DocVector(shard.asVector(), segments, docs, null).asBlock(); + shard = null; + segments = null; + docs = null; + if (currentScoresBuilder == null) { + page = new Page(size, docBlock); + } else { + scores = currentScoresBuilder.build().asBlock(); + page = new Page(size, docBlock, scores); + } } finally { if (page == null) { - Releasables.closeExpectNoException(shard, segments, docs); + Releasables.closeExpectNoException(shard, segments, docs, docBlock, scores); } } pagesEmitted++; return page; } + private float getScore(ScoreDoc scoreDoc) { + if (scoreDoc instanceof FieldDoc fieldDoc) { + if (Float.isNaN(fieldDoc.score)) { + if (sorts != null) { + return (Float) fieldDoc.fields[sorts.size() + 1]; + } else { + return (Float) fieldDoc.fields[0]; + } + } else { + return fieldDoc.score; + } + } else { + return scoreDoc.score; + } + } + + private DoubleVector.Builder scoreVectorOrNull(int size) { + if (scoreMode.needsScores()) { + return blockFactory.newDoubleVectorFixedBuilder(size); + } else { + return null; + } + } + @Override protected void describe(StringBuilder sb) { sb.append(", limit = ").append(limit); + sb.append(", scoreMode = ").append(scoreMode); String notPrettySorts = sorts.stream().map(Strings::toString).collect(Collectors.joining(",")); sb.append(", sorts = [").append(notPrettySorts).append("]"); } - static final class PerShardCollector { + PerShardCollector newPerShardCollector(ShardContext shardContext, List> sorts, int limit) throws IOException { + Optional sortAndFormats = shardContext.buildSort(sorts); + if (sortAndFormats.isEmpty()) { + throw new IllegalStateException("sorts must not be disabled in TopN"); + } + if (scoreMode.needsScores() == false) { + return new NonScoringPerShardCollector(shardContext, sortAndFormats.get().sort, limit); + } else { + SortField[] sortFields = sortAndFormats.get().sort.getSort(); + if (sortFields != null && sortFields.length == 1 && sortFields[0].needsScores() && sortFields[0].getReverse() == false) { + // SORT _score DESC + return new ScoringPerShardCollector( + shardContext, + new TopScoreDocCollectorManager(limit, null, limit, false).newCollector() + ); + } else { + // SORT ..., _score, ... 
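+                // Not a plain `_score DESC` sort: keep the requested sort keys and append
+                // FIELD_DOC and FIELD_SCORE, so the collector still materializes the score
+                // as the trailing sort value; getScore() above reads it back from
+                // FieldDoc.fields when the ScoreDoc score itself is NaN.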
+ var sort = new Sort(); + if (sortFields != null) { + var l = new ArrayList<>(Arrays.asList(sortFields)); + l.add(SortField.FIELD_DOC); + l.add(SortField.FIELD_SCORE); + sort = new Sort(l.toArray(SortField[]::new)); + } + return new ScoringPerShardCollector( + shardContext, + new TopFieldCollectorManager(sort, limit, null, limit, false).newCollector() + ); + } + } + } + + abstract static class PerShardCollector { private final ShardContext shardContext; - private final TopFieldCollector topFieldCollector; + private final TopDocsCollector collector; private int leafIndex; private LeafCollector leafCollector; private Thread currentThread; - PerShardCollector(ShardContext shardContext, List> sorts, int limit) throws IOException { + PerShardCollector(ShardContext shardContext, TopDocsCollector collector) { this.shardContext = shardContext; - Optional sortAndFormats = shardContext.buildSort(sorts); - if (sortAndFormats.isEmpty()) { - throw new IllegalStateException("sorts must not be disabled in TopN"); - } - - // We don't use CollectorManager here as we don't retrieve the total hits and sort by score. - this.topFieldCollector = new TopFieldCollectorManager(sortAndFormats.get().sort, limit, null, 0, false).newCollector(); + this.collector = collector; } LeafCollector getLeafCollector(LeafReaderContext leafReaderContext) throws IOException { if (currentThread != Thread.currentThread() || leafIndex != leafReaderContext.ord) { - leafCollector = topFieldCollector.getLeafCollector(leafReaderContext); + leafCollector = collector.getLeafCollector(leafReaderContext); leafIndex = leafReaderContext.ord; currentThread = Thread.currentThread(); } return leafCollector; } } + + static final class NonScoringPerShardCollector extends PerShardCollector { + NonScoringPerShardCollector(ShardContext shardContext, Sort sort, int limit) { + // We don't use CollectorManager here as we don't retrieve the total hits and sort by score. 
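+            // A totalHitsThreshold of 0 also tells Lucene that no hit-count accuracy is
+            // required, so it may skip over non-competitive documents entirely.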
+ super(shardContext, new TopFieldCollectorManager(sort, limit, null, 0, false).newCollector()); + } + } + + static final class ScoringPerShardCollector extends PerShardCollector { + ScoringPerShardCollector(ShardContext shardContext, TopDocsCollector topDocsCollector) { + super(shardContext, topDocsCollector); + } + } } diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/OperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/OperatorTests.java index 0d39a5bf8227e..e6ef10e53ec7c 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/OperatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/OperatorTests.java @@ -394,7 +394,8 @@ static LuceneOperator.Factory luceneOperatorFactory(IndexReader reader, Query qu randomFrom(DataPartitioning.values()), randomIntBetween(1, 10), randomPageSize(), - limit + limit, + false // no scoring ); } } diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneQueryExpressionEvaluatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneQueryExpressionEvaluatorTests.java index beca522878358..ffaee536b443e 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneQueryExpressionEvaluatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneQueryExpressionEvaluatorTests.java @@ -27,6 +27,8 @@ import org.elasticsearch.compute.data.BooleanVector; import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.DocBlock; +import org.elasticsearch.compute.data.DoubleBlock; import org.elasticsearch.compute.data.ElementType; import org.elasticsearch.compute.data.Page; import org.elasticsearch.compute.lucene.LuceneQueryExpressionEvaluator.DenseCollector; @@ -120,8 +122,9 @@ public void testTermQueryShuffled() throws IOException { private void assertTermQuery(String term, List results) { int matchCount = 0; for (Page page : results) { - BytesRefVector terms = page.getBlock(1).asVector(); - BooleanVector matches = page.getBlock(2).asVector(); + int initialBlockIndex = initialBlockIndex(page); + BytesRefVector terms = page.getBlock(initialBlockIndex).asVector(); + BooleanVector matches = page.getBlock(initialBlockIndex + 1).asVector(); for (int i = 0; i < page.getPositionCount(); i++) { BytesRef termAtPosition = terms.getBytesRef(i, new BytesRef()); assertThat(matches.getBoolean(i), equalTo(termAtPosition.utf8ToString().equals(term))); @@ -155,8 +158,9 @@ private void testTermsQuery(boolean shuffleDocs) throws IOException { List results = runQuery(values, new TermInSetQuery(MultiTermQuery.CONSTANT_SCORE_REWRITE, FIELD, matchingBytes), shuffleDocs); int matchCount = 0; for (Page page : results) { - BytesRefVector terms = page.getBlock(1).asVector(); - BooleanVector matches = page.getBlock(2).asVector(); + int initialBlockIndex = initialBlockIndex(page); + BytesRefVector terms = page.getBlock(initialBlockIndex).asVector(); + BooleanVector matches = page.getBlock(initialBlockIndex + 1).asVector(); for (int i = 0; i < page.getPositionCount(); i++) { BytesRef termAtPosition = terms.getBytesRef(i, new BytesRef()); assertThat(matches.getBoolean(i), equalTo(matching.contains(termAtPosition.utf8ToString()))); @@ -207,7 +211,7 @@ private List runQuery(Set values, Query query, boolean shuffleDocs List results = new ArrayList<>(); Driver 
driver = new Driver( driverContext, - luceneOperatorFactory(reader, new MatchAllDocsQuery(), LuceneOperator.NO_LIMIT).get(driverContext), + luceneOperatorFactory(reader, new MatchAllDocsQuery(), LuceneOperator.NO_LIMIT, scoring).get(driverContext), operators, new TestResultPageSinkOperator(results::add), () -> {} @@ -248,7 +252,21 @@ private DriverContext driverContext() { return new DriverContext(blockFactory.bigArrays(), blockFactory); } - static LuceneOperator.Factory luceneOperatorFactory(IndexReader reader, Query query, int limit) { + // Scores are not interesting to this test, but enabled conditionally and effectively ignored just for coverage. + private final boolean scoring = randomBoolean(); + + // Returns the initial block index, ignoring the score block if scoring is enabled + private int initialBlockIndex(Page page) { + assert page.getBlock(0) instanceof DocBlock : "expected doc block at index 0"; + if (scoring) { + assert page.getBlock(1) instanceof DoubleBlock : "expected double block at index 1"; + return 2; + } else { + return 1; + } + } + + static LuceneOperator.Factory luceneOperatorFactory(IndexReader reader, Query query, int limit, boolean scoring) { final ShardContext searchContext = new LuceneSourceOperatorTests.MockShardContext(reader, 0); return new LuceneSourceOperator.Factory( List.of(searchContext), @@ -256,7 +274,8 @@ static LuceneOperator.Factory luceneOperatorFactory(IndexReader reader, Query qu randomFrom(DataPartitioning.values()), randomIntBetween(1, 10), randomPageSize(), - limit + limit, + scoring ); } } diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSourceOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSourceOperatorTests.java index 626190c04c501..2dcc5e20d3f98 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSourceOperatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSourceOperatorTests.java @@ -17,6 +17,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.elasticsearch.common.breaker.CircuitBreakingException; +import org.elasticsearch.compute.data.DocBlock; +import org.elasticsearch.compute.data.DoubleBlock; import org.elasticsearch.compute.data.ElementType; import org.elasticsearch.compute.data.LongBlock; import org.elasticsearch.compute.data.Page; @@ -63,10 +65,10 @@ public void closeIndex() throws IOException { @Override protected LuceneSourceOperator.Factory simple() { - return simple(randomFrom(DataPartitioning.values()), between(1, 10_000), 100); + return simple(randomFrom(DataPartitioning.values()), between(1, 10_000), 100, scoring); } - private LuceneSourceOperator.Factory simple(DataPartitioning dataPartitioning, int numDocs, int limit) { + private LuceneSourceOperator.Factory simple(DataPartitioning dataPartitioning, int numDocs, int limit, boolean scoring) { int commitEvery = Math.max(1, numDocs / 10); try ( RandomIndexWriter writer = new RandomIndexWriter( @@ -91,7 +93,7 @@ private LuceneSourceOperator.Factory simple(DataPartitioning dataPartitioning, i ShardContext ctx = new MockShardContext(reader, 0); Function queryFunction = c -> new MatchAllDocsQuery(); int maxPageSize = between(10, Math.max(10, numDocs)); - return new LuceneSourceOperator.Factory(List.of(ctx), queryFunction, dataPartitioning, 1, maxPageSize, limit); + return new LuceneSourceOperator.Factory(List.of(ctx), 
queryFunction, dataPartitioning, 1, maxPageSize, limit, scoring); } @Override @@ -101,7 +103,10 @@ protected Matcher expectedToStringOfSimple() { @Override protected Matcher expectedDescriptionOfSimple() { - return matchesRegex("LuceneSourceOperator\\[dataPartitioning = (DOC|SHARD|SEGMENT), maxPageSize = \\d+, limit = 100]"); + return matchesRegex( + "LuceneSourceOperator" + + "\\[dataPartitioning = (DOC|SHARD|SEGMENT), maxPageSize = \\d+, limit = 100, scoreMode = (COMPLETE|COMPLETE_NO_SCORES)]" + ); } // TODO tests for the other data partitioning configurations @@ -149,7 +154,7 @@ public void testShardDataPartitioningWithCranky() { } private void testSimple(DriverContext ctx, int size, int limit) { - LuceneSourceOperator.Factory factory = simple(DataPartitioning.SHARD, size, limit); + LuceneSourceOperator.Factory factory = simple(DataPartitioning.SHARD, size, limit, scoring); Operator.OperatorFactory readS = ValuesSourceReaderOperatorTests.factory(reader, S_FIELD, ElementType.LONG); List results = new ArrayList<>(); @@ -164,7 +169,7 @@ private void testSimple(DriverContext ctx, int size, int limit) { } for (Page page : results) { - LongBlock sBlock = page.getBlock(1); + LongBlock sBlock = page.getBlock(initialBlockIndex(page)); for (int p = 0; p < page.getPositionCount(); p++) { assertThat(sBlock.getLong(sBlock.getFirstValueIndex(p)), both(greaterThanOrEqualTo(0L)).and(lessThan((long) size))); } @@ -174,6 +179,20 @@ private void testSimple(DriverContext ctx, int size, int limit) { assertThat(results, hasSize(both(greaterThanOrEqualTo(minPages)).and(lessThanOrEqualTo(maxPages)))); } + // Scores are not interesting to this test, but enabled conditionally and effectively ignored just for coverage. + private final boolean scoring = randomBoolean(); + + // Returns the initial block index, ignoring the score block if scoring is enabled + private int initialBlockIndex(Page page) { + assert page.getBlock(0) instanceof DocBlock : "expected doc block at index 0"; + if (scoring) { + assert page.getBlock(1) instanceof DoubleBlock : "expected double block at index 1"; + return 2; + } else { + return 1; + } + } + /** * Creates a mock search context with the given index reader. * The returned mock search context can be used to test with {@link LuceneOperator}. diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorScoringTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorScoringTests.java new file mode 100644 index 0000000000000..a0fa1c2c01c0a --- /dev/null +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorScoringTests.java @@ -0,0 +1,151 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.compute.lucene; + +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.elasticsearch.compute.data.DoubleBlock; +import org.elasticsearch.compute.data.ElementType; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.compute.operator.Driver; +import org.elasticsearch.compute.operator.DriverContext; +import org.elasticsearch.compute.operator.Operator; +import org.elasticsearch.compute.operator.OperatorTestCase; +import org.elasticsearch.compute.operator.TestResultPageSinkOperator; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.sort.FieldSortBuilder; +import org.elasticsearch.search.sort.SortAndFormats; +import org.elasticsearch.search.sort.SortBuilder; +import org.hamcrest.Matcher; +import org.junit.After; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.Function; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.matchesRegex; + +public class LuceneTopNSourceOperatorScoringTests extends LuceneTopNSourceOperatorTests { + private static final MappedFieldType S_FIELD = new NumberFieldMapper.NumberFieldType("s", NumberFieldMapper.NumberType.LONG); + private Directory directory = newDirectory(); + private IndexReader reader; + + @After + private void closeIndex() throws IOException { + IOUtils.close(reader, directory); + } + + @Override + protected LuceneTopNSourceOperator.Factory simple() { + return simple(DataPartitioning.SHARD, 10_000, 100); + } + + private LuceneTopNSourceOperator.Factory simple(DataPartitioning dataPartitioning, int size, int limit) { + int commitEvery = Math.max(1, size / 10); + try ( + RandomIndexWriter writer = new RandomIndexWriter( + random(), + directory, + newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE) + ) + ) { + for (int d = 0; d < size; d++) { + List doc = new ArrayList<>(); + doc.add(new SortedNumericDocValuesField("s", d)); + writer.addDocument(doc); + if (d % commitEvery == 0) { + writer.commit(); + } + } + reader = writer.getReader(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + ShardContext ctx = new LuceneSourceOperatorTests.MockShardContext(reader, 0) { + @Override + public Optional buildSort(List> sorts) { + SortField field = new SortedNumericSortField("s", SortField.Type.LONG, false, SortedNumericSelector.Type.MIN); + return Optional.of(new SortAndFormats(new Sort(field), new DocValueFormat[] { null })); + } + }; + Function queryFunction = c -> new MatchAllDocsQuery(); + int taskConcurrency = 0; + int maxPageSize = between(10, Math.max(10, size)); + List> sorts = List.of(new FieldSortBuilder("s")); + return new LuceneTopNSourceOperator.Factory( + List.of(ctx), + 
queryFunction, + dataPartitioning, + taskConcurrency, + maxPageSize, + limit, + sorts, + true // scoring + ); + } + + @Override + protected Matcher expectedToStringOfSimple() { + return matchesRegex("LuceneTopNSourceOperator\\[maxPageSize = \\d+, limit = 100, scoreMode = COMPLETE, sorts = \\[\\{.+}]]"); + } + + @Override + protected Matcher expectedDescriptionOfSimple() { + return matchesRegex( + "LuceneTopNSourceOperator" + + "\\[dataPartitioning = (DOC|SHARD|SEGMENT), maxPageSize = \\d+, limit = 100, scoreMode = COMPLETE, sorts = \\[\\{.+}]]" + ); + } + + @Override + protected void testSimple(DriverContext ctx, int size, int limit) { + LuceneTopNSourceOperator.Factory factory = simple(DataPartitioning.SHARD, size, limit); + Operator.OperatorFactory readS = ValuesSourceReaderOperatorTests.factory(reader, S_FIELD, ElementType.LONG); + + List results = new ArrayList<>(); + OperatorTestCase.runDriver( + new Driver(ctx, factory.get(ctx), List.of(readS.get(ctx)), new TestResultPageSinkOperator(results::add), () -> {}) + ); + OperatorTestCase.assertDriverContext(ctx); + + long expectedS = 0; + int maxPageSize = factory.maxPageSize(); + for (Page page : results) { + if (limit - expectedS < maxPageSize) { + assertThat(page.getPositionCount(), equalTo((int) (limit - expectedS))); + } else { + assertThat(page.getPositionCount(), equalTo(maxPageSize)); + } + DoubleBlock sBlock = page.getBlock(1); + for (int p = 0; p < page.getPositionCount(); p++) { + assertThat(sBlock.getDouble(sBlock.getFirstValueIndex(p)), equalTo(1.0d)); + expectedS++; + } + } + int pages = (int) Math.ceil((float) Math.min(size, limit) / maxPageSize); + assertThat(results, hasSize(pages)); + } +} diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorTests.java index 938c4ce5c9f7d..d9a0b70b7931e 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneTopNSourceOperatorTests.java @@ -20,6 +20,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.elasticsearch.common.breaker.CircuitBreakingException; +import org.elasticsearch.compute.data.DocBlock; +import org.elasticsearch.compute.data.DoubleBlock; import org.elasticsearch.compute.data.ElementType; import org.elasticsearch.compute.data.LongBlock; import org.elasticsearch.compute.data.Page; @@ -56,7 +58,7 @@ public class LuceneTopNSourceOperatorTests extends AnyOperatorTestCase { private IndexReader reader; @After - public void closeIndex() throws IOException { + private void closeIndex() throws IOException { IOUtils.close(reader, directory); } @@ -105,19 +107,25 @@ public Optional buildSort(List> sorts) { taskConcurrency, maxPageSize, limit, - sorts + sorts, + scoring ); } @Override protected Matcher expectedToStringOfSimple() { - return matchesRegex("LuceneTopNSourceOperator\\[maxPageSize = \\d+, limit = 100, sorts = \\[\\{.+}]]"); + var s = scoring ? "COMPLETE" : "TOP_DOCS"; + return matchesRegex("LuceneTopNSourceOperator\\[maxPageSize = \\d+, limit = 100, scoreMode = " + s + ", sorts = \\[\\{.+}]]"); } @Override protected Matcher expectedDescriptionOfSimple() { + var s = scoring ? 
"COMPLETE" : "TOP_DOCS"; return matchesRegex( - "LuceneTopNSourceOperator\\[dataPartitioning = (DOC|SHARD|SEGMENT), maxPageSize = \\d+, limit = 100, sorts = \\[\\{.+}]]" + "LuceneTopNSourceOperator" + + "\\[dataPartitioning = (DOC|SHARD|SEGMENT), maxPageSize = \\d+, limit = 100, scoreMode = " + + s + + ", sorts = \\[\\{.+}]]" ); } @@ -137,12 +145,24 @@ public void testShardDataPartitioningWithCranky() { } } - private void testShardDataPartitioning(DriverContext context) { + void testShardDataPartitioning(DriverContext context) { int size = between(1_000, 20_000); int limit = between(10, size); testSimple(context, size, limit); } + public void testWithCranky() { + try { + int size = between(1_000, 20_000); + int limit = between(10, size); + testSimple(crankyDriverContext(), size, limit); + logger.info("cranky didn't break"); + } catch (CircuitBreakingException e) { + logger.info("broken", e); + assertThat(e.getMessage(), equalTo(CrankyCircuitBreakerService.ERROR_MESSAGE)); + } + } + public void testEmpty() { testEmpty(driverContext()); } @@ -157,11 +177,11 @@ public void testEmptyWithCranky() { } } - private void testEmpty(DriverContext context) { + void testEmpty(DriverContext context) { testSimple(context, 0, between(10, 10_000)); } - private void testSimple(DriverContext ctx, int size, int limit) { + protected void testSimple(DriverContext ctx, int size, int limit) { LuceneTopNSourceOperator.Factory factory = simple(DataPartitioning.SHARD, size, limit); Operator.OperatorFactory readS = ValuesSourceReaderOperatorTests.factory(reader, S_FIELD, ElementType.LONG); @@ -178,7 +198,7 @@ private void testSimple(DriverContext ctx, int size, int limit) { } else { assertThat(page.getPositionCount(), equalTo(factory.maxPageSize())); } - LongBlock sBlock = page.getBlock(1); + LongBlock sBlock = page.getBlock(initialBlockIndex(page)); for (int p = 0; p < page.getPositionCount(); p++) { assertThat(sBlock.getLong(sBlock.getFirstValueIndex(p)), equalTo(expectedS++)); } @@ -186,4 +206,18 @@ private void testSimple(DriverContext ctx, int size, int limit) { int pages = (int) Math.ceil((float) Math.min(size, limit) / factory.maxPageSize()); assertThat(results, hasSize(pages)); } + + // Scores are not interesting to this test, but enabled conditionally and effectively ignored just for coverage. 
+ private final boolean scoring = randomBoolean(); + + // Returns the initial block index, ignoring the score block if scoring is enabled + private int initialBlockIndex(Page page) { + assert page.getBlock(0) instanceof DocBlock : "expected doc block at index 0"; + if (scoring) { + assert page.getBlock(1) instanceof DoubleBlock : "expected double block at index 1"; + return 2; + } else { + return 1; + } + } } diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValueSourceReaderTypeConversionTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValueSourceReaderTypeConversionTests.java index f6d81af7c14e5..f31573f121a71 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValueSourceReaderTypeConversionTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValueSourceReaderTypeConversionTests.java @@ -265,7 +265,8 @@ private SourceOperator simpleInput(DriverContext context, int size, int commitEv DataPartitioning.SHARD, 1,// randomIntBetween(1, 10), pageSize, - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); return luceneFactory.get(context); } @@ -1292,7 +1293,8 @@ public void testWithNulls() throws IOException { randomFrom(DataPartitioning.values()), randomIntBetween(1, 10), randomPageSize(), - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); var vsShardContext = new ValuesSourceReaderOperator.ShardContext(reader(indexKey), () -> SourceLoader.FROM_STORED_SOURCE); try ( @@ -1450,7 +1452,8 @@ public void testManyShards() throws IOException { DataPartitioning.SHARD, randomIntBetween(1, 10), 1000, - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); // TODO add index2 MappedFieldType ft = mapperService(indexKey).fieldType("key"); diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValuesSourceReaderOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValuesSourceReaderOperatorTests.java index c8dd6f87be5fc..95b313b0b5412 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValuesSourceReaderOperatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/ValuesSourceReaderOperatorTests.java @@ -170,7 +170,8 @@ private SourceOperator simpleInput(DriverContext context, int size, int commitEv DataPartitioning.SHARD, randomIntBetween(1, 10), pageSize, - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); return luceneFactory.get(context); } @@ -1301,7 +1302,8 @@ public void testWithNulls() throws IOException { randomFrom(DataPartitioning.values()), randomIntBetween(1, 10), randomPageSize(), - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); try ( Driver driver = new Driver( @@ -1524,7 +1526,8 @@ public void testManyShards() throws IOException { DataPartitioning.SHARD, randomIntBetween(1, 10), 1000, - LuceneOperator.NO_LIMIT + LuceneOperator.NO_LIMIT, + false // no scoring ); MappedFieldType ft = mapperService.fieldType("key"); var readerFactory = new ValuesSourceReaderOperator.Factory( diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/qstr-function.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/qstr-function.csv-spec index 6039dc05b6c44..2c84bdae6b32e 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/qstr-function.csv-spec +++ 
b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/qstr-function.csv-spec @@ -100,7 +100,6 @@ book_no:keyword | title:text 7140 | The Lord of the Rings Poster Collection: Six Paintings by Alan Lee (No. 1) ; - qstrWithMultivaluedTextField required_capability: qstr_function diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/scoring.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/scoring.csv-spec new file mode 100644 index 0000000000000..d4c7b8c59fdbc --- /dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/scoring.csv-spec @@ -0,0 +1,285 @@ +############################################### +# Tests for scoring support +# + +singleQstrBoostScoringSorted +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:Lord Rings^2") +| eval c_score = ceil(_score) +| keep book_no, title, c_score +| sort c_score desc, book_no asc +| LIMIT 2; + +book_no:keyword | title:text | c_score:double +2675 | The Lord of the Rings - Boxed Set | 6.0 +4023 | A Tolkien Compass: Including J. R. R. Tolkien's Guide to the Names in The Lord of the Rings | 6.0 +; + +singleMatchWithKeywordFieldScoring +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where author.keyword:"William Faulkner" +| keep book_no, author, _score +| sort book_no; + +book_no:keyword | author:text | _score:double +2713 | William Faulkner | 2.3142893314361572 +2883 | William Faulkner | 2.3142893314361572 +4724 | William Faulkner | 2.3142893314361572 +4977 | William Faulkner | 2.3142893314361572 +5119 | William Faulkner | 2.3142893314361572 +5404 | William Faulkner | 2.3142893314361572 +5578 | William Faulkner | 2.3142893314361572 +8077 | William Faulkner | 2.3142893314361572 +9896 | William Faulkner | 2.3142893314361572 +; + +qstrWithFieldAndScoringSortedEval +required_capability: qstr_function +required_capability: metadata_score + +from books metadata _score +| where qstr("title:rings") +| sort _score desc +| eval _score::long +| keep book_no, title, _score +| limit 3; + +book_no:keyword | title:text | _score:double +2675 | The Lord of the Rings - Boxed Set | 2.7583377361297607 +7140 | The Lord of the Rings Poster Collection: Six Paintings by Alan Lee (No. 1) | 1.9239964485168457 +2714 | Return of the King Being the Third Part of The Lord of the Rings | 1.9239964485168457 +; + +qstrWithFieldAndScoringSorted +required_capability: qstr_function +required_capability: metadata_score + +from books metadata _score +| where qstr("title:rings") +| sort _score desc, book_no desc +| keep book_no, title, _score +| limit 3; + +book_no:keyword | title:text | _score:double +2675 | The Lord of the Rings - Boxed Set | 2.7583377361297607 +7140 | The Lord of the Rings Poster Collection: Six Paintings by Alan Lee (No. 
1) | 1.9239964485168457 +2714 | Return of the King Being the Third Part of The Lord of the Rings | 1.9239964485168457 +; + +singleQstrScoringManipulated +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:William Faulkner") +| eval add_score = ceil(_score) + 1 +| keep book_no, author, add_score +| sort book_no +| LIMIT 2; + +book_no:keyword | author:text | add_score:double +2378 | [Carol Faulkner, Holly Byers Ochoa, Lucretia Mott] | 2.0 +2713 | William Faulkner | 7.0 +; + +testMultiValuedFieldWithConjunctionWithScore +required_capability: match_function +required_capability: metadata_score + +from employees metadata _score +| where match(job_positions, "Data Scientist") and match(job_positions, "Support Engineer") +| keep emp_no, first_name, last_name, job_positions, _score; + +emp_no:integer | first_name:keyword | last_name:keyword | job_positions:keyword | _score:double +10043 | Yishay | Tzvieli | [Data Scientist, Python Developer, Support Engineer] | 5.233309745788574 +; + +testMatchAndQueryStringFunctionsWithScore +required_capability: match_function +required_capability: metadata_score + +from employees metadata _score +| where match(job_positions, "Data Scientist") and qstr("job_positions: (Support Engineer) and gender: F") +| keep emp_no, first_name, last_name, job_positions, _score; +ignoreOrder:true + +emp_no:integer | first_name:keyword | last_name:keyword | job_positions:keyword | _score:double +10041 | Uri | Lenart | [Data Scientist, Head Human Resources, Internship, Senior Team Lead] | 3.509873867034912 +10043 | Yishay | Tzvieli | [Data Scientist, Python Developer, Support Engineer] | 5.233309745788574 +; + +multipleWhereWithMatchScoringNoSort +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"short stories" +| where author:"Ursula K. Le Guin" +| keep book_no, title, author, _score; + +ignoreOrder:true +book_no:keyword | title:text | author:text | _score:double +8480 | The wind's twelve quarters: Short stories | Ursula K. Le Guin | 14.489097595214844 +; + +multipleWhereWithMatchScoring +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"short stories" +| where author:"Ursula K. Le Guin" +| keep book_no, title, author, _score +| sort book_no; + +book_no:keyword | title:text | author:text | _score:double +8480 | The wind's twelve quarters: Short stories | Ursula K. Le Guin | 14.489097595214844 +; + +combinedMatchWithFunctionsScoring +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"Tolkien" AND author:"Tolkien" AND year > 2000 +| where mv_count(author) == 1 +| keep book_no, title, author, year, _score +| sort book_no; + +book_no:keyword | title:text | author:text | year:integer | _score:double +5335 | Letters of J R R Tolkien | J.R.R. 
Tolkien | 2014 | 5.448054313659668 +; + +singleQstrScoring +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:William Faulkner") +| keep book_no, author, _score +| sort book_no +| LIMIT 2; + +book_no:keyword | author:text | _score:double +2378 | [Carol Faulkner, Holly Byers Ochoa, Lucretia Mott] | 0.9976131916046143 +2713 | William Faulkner | 5.9556169509887695 +; + +singleQstrScoringGrok +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:Lord Rings") +| GROK title "%{WORD:title} %{WORD}" +| sort _score desc +| keep book_no, title, _score +| LIMIT 3; + +book_no:keyword | title:keyword | _score:double +8875 | The | 2.9505908489227295 +4023 | A | 2.8327860832214355 +2675 | The | 2.7583377361297607 +; + +combinedMatchWithScoringEvalNoSort +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"Tolkien" AND author:"Tolkien" AND year > 2000 +| where mv_count(author) == 1 +| eval c_score = ceil(_score) +| keep book_no, title, author, year, c_score; + +ignoreOrder:true +book_no:keyword | title:text | author:text | year:integer | c_score:double +5335 | Letters of J R R Tolkien | J.R.R. Tolkien | 2014 | 6 +; + +singleQstrScoringRename +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:Lord Rings") +| rename _score as rank +| sort rank desc +| keep book_no, rank +| LIMIT 3; + +book_no:keyword | rank:double +8875 | 2.9505908489227295 +4023 | 2.8327860832214355 +2675 | 2.7583377361297607 +; + +singleMatchWithTextFieldScoring +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where author:"William Faulkner" +| sort book_no +| keep book_no, author, _score +| limit 5; + +book_no:keyword | author:text | _score:double +2378 | [Carol Faulkner, Holly Byers Ochoa, Lucretia Mott] | 0.9976131916046143 +2713 | William Faulkner | 4.272439002990723 +2847 | Colleen Faulkner | 1.7401835918426514 +2883 | William Faulkner | 4.272439002990723 +3293 | Danny Faulkner | 1.7401835918426514 +; + +combinedMatchWithFunctionsScoringNoSort +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"Tolkien" AND author:"Tolkien" AND year > 2000 +| where mv_count(author) == 1 +| keep book_no, title, author, year, _score; + +ignoreOrder:true +book_no:keyword | title:text | author:text | year:integer | _score:double +5335 | Letters of J R R Tolkien | J.R.R. Tolkien | 2014 | 5.448054313659668 +; + +combinedMatchWithScoringEval +required_capability: metadata_score +required_capability: match_operator_colon + +from books metadata _score +| where title:"Tolkien" AND author:"Tolkien" AND year > 2000 +| where mv_count(author) == 1 +| eval c_score = ceil(_score) +| keep book_no, title, author, year, c_score +| sort book_no; + +book_no:keyword | title:text | author:text | year:integer | c_score:double +5335 | Letters of J R R Tolkien | J.R.R. 
Tolkien | 2014 | 6 +; + +singleQstrScoringEval +required_capability: metadata_score +required_capability: qstr_function + +from books metadata _score +| where qstr("author:Lord Rings") +| eval c_score = ceil(_score) +| keep book_no, c_score +| sort book_no desc +| LIMIT 3; + +book_no:keyword | c_score:double +8875 | 3.0 +7350 | 2.0 +7140 | 3.0 +; diff --git a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/EsqlActionTaskIT.java b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/EsqlActionTaskIT.java index 56453a291ea81..1939f81353c0e 100644 --- a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/EsqlActionTaskIT.java +++ b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/EsqlActionTaskIT.java @@ -89,7 +89,7 @@ public void setup() { assumeTrue("requires query pragmas", canUseQueryPragmas()); nodeLevelReduction = randomBoolean(); READ_DESCRIPTION = """ - \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = 2147483647] + \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = 2147483647, scoreMode = COMPLETE_NO_SCORES] \\_ValuesSourceReaderOperator[fields = [pause_me]] \\_AggregationOperator[mode = INITIAL, aggs = sum of longs] \\_ExchangeSinkOperator""".replace("pageSize()", Integer.toString(pageSize())); @@ -448,6 +448,7 @@ protected void doRun() throws Exception { public void testTaskContentsForTopNQuery() throws Exception { READ_DESCRIPTION = ("\\_LuceneTopNSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = 1000, " + + "scoreMode = TOP_DOCS, " + "sorts = [{\"pause_me\":{\"order\":\"asc\",\"missing\":\"_last\",\"unmapped_type\":\"long\"}}]]\n" + "\\_ValuesSourceReaderOperator[fields = [pause_me]]\n" + "\\_ProjectOperator[projection = [1]]\n" @@ -482,7 +483,7 @@ public void testTaskContentsForTopNQuery() throws Exception { public void testTaskContentsForLimitQuery() throws Exception { String limit = Integer.toString(randomIntBetween(pageSize() + 1, 2 * numberOfDocs())); READ_DESCRIPTION = """ - \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = limit()] + \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = limit(), scoreMode = COMPLETE_NO_SCORES] \\_ValuesSourceReaderOperator[fields = [pause_me]] \\_ProjectOperator[projection = [1]] \\_ExchangeSinkOperator""".replace("pageSize()", Integer.toString(pageSize())).replace("limit()", limit); @@ -511,7 +512,7 @@ public void testTaskContentsForLimitQuery() throws Exception { public void testTaskContentsForGroupingStatsQuery() throws Exception { READ_DESCRIPTION = """ - \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = 2147483647] + \\_LuceneSourceOperator[dataPartitioning = SHARD, maxPageSize = pageSize(), limit = 2147483647, scoreMode = COMPLETE_NO_SCORES] \\_ValuesSourceReaderOperator[fields = [foo]] \\_OrdinalsGroupingOperator(aggs = max of longs) \\_ExchangeSinkOperator""".replace("pageSize()", Integer.toString(pageSize())); diff --git a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/LookupFromIndexIT.java b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/LookupFromIndexIT.java index 5c0c13b48df3b..3b9359fe66d40 100644 --- a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/LookupFromIndexIT.java +++ 
b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/LookupFromIndexIT.java @@ -148,7 +148,8 @@ public void testLookupIndex() throws IOException { DataPartitioning.SEGMENT, 1, 10000, - DocIdSetIterator.NO_MORE_DOCS + DocIdSetIterator.NO_MORE_DOCS, + false // no scoring ); ValuesSourceReaderOperator.Factory reader = new ValuesSourceReaderOperator.Factory( List.of( diff --git a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchFunctionIT.java b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchFunctionIT.java new file mode 100644 index 0000000000000..99f7d48a0d636 --- /dev/null +++ b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchFunctionIT.java @@ -0,0 +1,299 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.plugin; + +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.xpack.esql.VerificationException; +import org.elasticsearch.xpack.esql.action.AbstractEsqlIntegTestCase; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; +import org.elasticsearch.xpack.esql.action.EsqlQueryRequest; +import org.elasticsearch.xpack.esql.action.EsqlQueryResponse; +import org.junit.Before; + +import java.util.List; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.hamcrest.CoreMatchers.containsString; + +//@TestLogging(value = "org.elasticsearch.xpack.esql:TRACE,org.elasticsearch.compute:TRACE", reason = "debug") +public class MatchFunctionIT extends AbstractEsqlIntegTestCase { + + @Before + public void setupIndex() { + createAndPopulateIndex(); + } + + @Override + protected EsqlQueryResponse run(EsqlQueryRequest request) { + assumeTrue("match function capability not available", EsqlCapabilities.Cap.MATCH_FUNCTION.isEnabled()); + return super.run(request); + } + + public void testSimpleWhereMatch() { + var query = """ + FROM test + | WHERE match(content, "fox") + | KEEP id + | SORT id + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id")); + assertColumnTypes(resp.columns(), List.of("integer")); + assertValues(resp.values(), List.of(List.of(1), List.of(6))); + } + } + + public void testCombinedWhereMatch() { + var query = """ + FROM test + | WHERE match(content, "fox") AND id > 5 + | KEEP id + | SORT id + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id")); + assertColumnTypes(resp.columns(), List.of("integer")); + assertValues(resp.values(), List.of(List.of(6))); + } + } + + public void testMultipleMatch() { + var query = """ + FROM test + | WHERE match(content, "fox") AND match(content, "brown") + | KEEP id + | SORT id + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id")); + assertColumnTypes(resp.columns(), List.of("integer")); + assertValues(resp.values(), List.of(List.of(1), List.of(6))); + } + } + + public void testMultipleWhereMatch() { + var query = """ + FROM test + | WHERE match(content, "fox") AND match(content, "brown") + | EVAL summary = 
CONCAT("document with id: ", to_str(id), "and content: ", content) + | SORT summary + | LIMIT 4 + | WHERE match(content, "brown fox") + | KEEP id + """; + + var error = expectThrows(ElasticsearchException.class, () -> run(query)); + assertThat(error.getMessage(), containsString("[MATCH] function cannot be used after LIMIT")); + } + + public void testNotWhereMatch() { + var query = """ + FROM test + | WHERE NOT match(content, "brown fox") + | KEEP id + | SORT id + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id")); + assertColumnTypes(resp.columns(), List.of("integer")); + assertValues(resp.values(), List.of(List.of(5))); + } + } + + public void testWhereMatchWithScoring() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE match(content, "fox") + | KEEP id, _score + | SORT id ASC + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + + public void testWhereMatchWithScoringDifferentSort() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE match(content, "fox") + | KEEP id, _score + | SORT id DESC + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues(resp.values(), List.of(List.of(6, 0.9114001989364624), List.of(1, 1.156558871269226))); + } + } + + public void testWhereMatchWithScoringSortScore() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE match(content, "fox") + | KEEP id, _score + | SORT _score DESC + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + + public void testWhereMatchWithScoringNoSort() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE content:"fox" + | KEEP id, _score + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValuesInAnyOrder(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + + public void testNonExistingColumn() { + var query = """ + FROM test + | WHERE something:"fox" + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat(error.getMessage(), containsString("Unknown column [something]")); + } + + public void testWhereMatchEvalColumn() { + var query = """ + FROM test + | EVAL upper_content = to_upper(content) + | WHERE upper_content:"FOX" + | KEEP id + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat( + error.getMessage(), + containsString("[:] operator cannot operate on [upper_content], which is not a field from an index mapping") + ); + } + + public void 
testWhereMatchOverWrittenColumn() { + var query = """ + FROM test + | DROP content + | EVAL content = CONCAT("document with ID ", to_str(id)) + | WHERE content:"document" + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat( + error.getMessage(), + containsString("[:] operator cannot operate on [content], which is not a field from an index mapping") + ); + } + + public void testWhereMatchAfterStats() { + var query = """ + FROM test + | STATS count(*) + | WHERE content:"fox" + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat(error.getMessage(), containsString("Unknown column [content]")); + } + + public void testWhereMatchWithFunctions() { + var query = """ + FROM test + | WHERE content:"fox" OR to_upper(content) == "FOX" + """; + var error = expectThrows(ElasticsearchException.class, () -> run(query)); + assertThat( + error.getMessage(), + containsString( + "Invalid condition [content:\"fox\" OR to_upper(content) == \"FOX\"]. " + + "[:] operator can't be used as part of an or condition" + ) + ); + } + + public void testWhereMatchWithRow() { + var query = """ + ROW content = "a brown fox" + | WHERE content:"fox" + """; + + var error = expectThrows(ElasticsearchException.class, () -> run(query)); + assertThat( + error.getMessage(), + containsString("[:] operator cannot operate on [\"a brown fox\"], which is not a field from an index mapping") + ); + } + + public void testMatchWithinEval() { + var query = """ + FROM test + | EVAL matches_query = content:"fox" + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat(error.getMessage(), containsString("[:] operator is only supported in WHERE commands")); + } + + public void testMatchWithNonTextField() { + var query = """ + FROM test + | WHERE id:"fox" + """; + + var error = expectThrows(VerificationException.class, () -> run(query)); + assertThat(error.getMessage(), containsString("first argument of [id:\"fox\"] must be [string], found value [id] type [integer]")); + } + + private void createAndPopulateIndex() { + var indexName = "test"; + var client = client().admin().indices(); + var createRequest = client.prepareCreate(indexName) + .setSettings(Settings.builder().put("index.number_of_shards", 1)) + .setMapping("id", "type=integer", "content", "type=text"); + assertAcked(createRequest); + client().prepareBulk() + .add(new IndexRequest(indexName).id("1").source("id", 1, "content", "This is a brown fox")) + .add(new IndexRequest(indexName).id("2").source("id", 2, "content", "This is a brown dog")) + .add(new IndexRequest(indexName).id("3").source("id", 3, "content", "This dog is really brown")) + .add(new IndexRequest(indexName).id("4").source("id", 4, "content", "The dog is brown but this document is very very long")) + .add(new IndexRequest(indexName).id("5").source("id", 5, "content", "There is also a white cat")) + .add(new IndexRequest(indexName).id("6").source("id", 6, "content", "The quick brown fox jumps over the lazy dog")) + .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) + .get(); + ensureYellow(indexName); + } +} diff --git a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchOperatorIT.java b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchOperatorIT.java index 3b647583f1129..6a360eb319abb 100644 --- a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchOperatorIT.java +++ 
b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/MatchOperatorIT.java @@ -14,6 +14,7 @@ import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.xpack.esql.VerificationException; import org.elasticsearch.xpack.esql.action.AbstractEsqlIntegTestCase; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; import org.junit.Before; import java.util.List; @@ -105,6 +106,56 @@ public void testNotWhereMatch() { } } + public void testWhereMatchWithScoring() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE content:"fox" + | KEEP id, _score + | SORT id ASC + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + + public void testWhereMatchWithScoringDifferentSort() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE content:"fox" + | KEEP id, _score + | SORT id + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + + public void testWhereMatchWithScoringNoSort() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE content:"fox" + | KEEP id, _score + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValuesInAnyOrder(resp.values(), List.of(List.of(1, 1.156558871269226), List.of(6, 0.9114001989364624))); + } + } + public void testNonExistingColumn() { var query = """ FROM test diff --git a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/QueryStringIT.java b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/QueryStringIT.java index 03af16d29e9b4..a3d1ac931528c 100644 --- a/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/QueryStringIT.java +++ b/x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/plugin/QueryStringIT.java @@ -13,6 +13,7 @@ import org.elasticsearch.index.query.QueryShardException; import org.elasticsearch.xpack.esql.VerificationException; import org.elasticsearch.xpack.esql.action.AbstractEsqlIntegTestCase; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; import org.junit.Before; import java.util.List; @@ -137,4 +138,99 @@ private void createAndPopulateIndex() { .get(); ensureYellow(indexName); } + + public void testWhereQstrWithScoring() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE qstr("content: fox") + | KEEP id, _score + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValuesInAnyOrder( + resp.values(), + List.of( + List.of(2, 0.3028995096683502), + List.of(3, 
0.3028995096683502), + List.of(4, 0.2547692656517029), + List.of(5, 0.28161853551864624) + ) + ); + + } + } + + public void testWhereQstrWithScoringSorted() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE qstr("content:fox fox") + | KEEP id, _score + | SORT _score DESC + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValues( + resp.values(), + List.of( + List.of(3, 1.5605685710906982), + List.of(2, 0.6057990193367004), + List.of(5, 0.5632370710372925), + List.of(4, 0.5095385313034058) + ) + ); + + } + } + + public void testWhereQstrWithScoringNoSort() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE qstr("content: fox") + | KEEP id, _score + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValuesInAnyOrder( + resp.values(), + List.of( + List.of(2, 0.3028995096683502), + List.of(3, 0.3028995096683502), + List.of(4, 0.2547692656517029), + List.of(5, 0.28161853551864624) + ) + ); + } + } + + public void testWhereQstrWithNonPushableAndScoring() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var query = """ + FROM test + METADATA _score + | WHERE qstr("content: fox") + AND abs(id) > 0 + | EVAL c_score = ceil(_score) + | KEEP id, c_score + | SORT id DESC + | LIMIT 2 + """; + + try (var resp = run(query)) { + assertColumnNames(resp.columns(), List.of("id", "c_score")); + assertColumnTypes(resp.columns(), List.of("integer", "double")); + assertValuesInAnyOrder(resp.values(), List.of(List.of(5, 1.0), List.of(4, 1.0))); + } + } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index d8004f73f613f..9bd4211855699 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -539,7 +539,12 @@ public enum Cap { /** * Fix for https://github.com/elastic/elasticsearch/issues/114714, again */ - FIX_STATS_BY_FOLDABLE_EXPRESSION_2,; + FIX_STATS_BY_FOLDABLE_EXPRESSION_2, + + /** + * Support the "METADATA _score" directive to enable _score column. 
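+ * Example (hypothetical query, gated on snapshot builds below): {@code FROM books METADATA _score | WHERE match(author, "Faulkner") | SORT _score DESC}.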
+ */ + METADATA_SCORE(Build.current().isSnapshot()); private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Verifier.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Verifier.java index 2be13398dab2f..5f8c011cff53a 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Verifier.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Verifier.java @@ -19,6 +19,7 @@ import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Expressions; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; +import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.expression.NameId; import org.elasticsearch.xpack.esql.core.expression.NamedExpression; import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; @@ -221,6 +222,7 @@ else if (p instanceof Lookup lookup) { checkFullTextQueryFunctions(p, failures); }); checkRemoteEnrich(plan, failures); + checkMetadataScoreNameReserved(plan, failures); if (failures.isEmpty()) { checkLicense(plan, licenseState, failures); @@ -234,6 +236,13 @@ else if (p instanceof Lookup lookup) { return failures; } + private static void checkMetadataScoreNameReserved(LogicalPlan p, Set failures) { + // _score can only be set as metadata attribute + if (p.inputSet().stream().anyMatch(a -> MetadataAttribute.SCORE.equals(a.name()) && (a instanceof MetadataAttribute) == false)) { + failures.add(fail(p, "`" + MetadataAttribute.SCORE + "` is a reserved METADATA attribute")); + } + } + private void checkSort(LogicalPlan p, Set failures) { if (p instanceof OrderBy ob) { ob.order().forEach(o -> { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/LucenePushdownPredicates.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/LucenePushdownPredicates.java index feb8717f007b7..8046d6bc56607 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/LucenePushdownPredicates.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/LucenePushdownPredicates.java @@ -9,6 +9,7 @@ import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; +import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.stats.SearchStats; @@ -59,6 +60,10 @@ default boolean isPushableFieldAttribute(Expression exp) { return false; } + default boolean isPushableMetadataAttribute(Expression exp) { + return exp instanceof MetadataAttribute ma && ma.name().equals(MetadataAttribute.SCORE); + } + /** * The default implementation of this has no access to SearchStats, so it can only make decisions based on the FieldAttribute itself. 
* In particular, it assumes TEXT fields have no exact subfields (underlying keyword field), diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSource.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSource.java index 925e144b69fcc..2b531257e594a 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSource.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSource.java @@ -14,6 +14,7 @@ import org.elasticsearch.xpack.esql.core.expression.AttributeMap; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; +import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.expression.NameId; import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.expression.Order; @@ -57,6 +58,7 @@ * */ public class PushTopNToSource extends PhysicalOptimizerRules.ParameterizedOptimizerRule { + @Override protected PhysicalPlan rule(TopNExec topNExec, LocalPhysicalOptimizerContext ctx) { Pushable pushable = evaluatePushable(topNExec, LucenePushdownPredicates.from(ctx.searchStats())); @@ -155,6 +157,8 @@ && canPushDownOrders(topNExec.order(), lucenePushdownPredicates)) { order.nullsPosition() ) ); + } else if (lucenePushdownPredicates.isPushableMetadataAttribute(order.child())) { + pushableSorts.add(new EsQueryExec.ScoreSort(order.direction())); } else if (order.child() instanceof ReferenceAttribute referenceAttribute) { Attribute resolvedAttribute = aliasReplacedBy.resolve(referenceAttribute, referenceAttribute); if (distances.containsKey(resolvedAttribute.id())) { @@ -192,13 +196,23 @@ && canPushDownOrders(topNExec.order(), lucenePushdownPredicates)) { private static boolean canPushDownOrders(List orders, LucenePushdownPredicates lucenePushdownPredicates) { // allow only exact FieldAttributes (no expressions) for sorting - return orders.stream().allMatch(o -> lucenePushdownPredicates.isPushableFieldAttribute(o.child())); + return orders.stream() + .allMatch( + o -> lucenePushdownPredicates.isPushableFieldAttribute(o.child()) + || lucenePushdownPredicates.isPushableMetadataAttribute(o.child()) + ); } private static List buildFieldSorts(List orders) { List sorts = new ArrayList<>(orders.size()); for (Order o : orders) { - sorts.add(new EsQueryExec.FieldSort(((FieldAttribute) o.child()).exactAttribute(), o.direction(), o.nullsPosition())); + if (o.child() instanceof FieldAttribute fa) { + sorts.add(new EsQueryExec.FieldSort(fa.exactAttribute(), o.direction(), o.nullsPosition())); + } else if (o.child() instanceof MetadataAttribute ma && MetadataAttribute.SCORE.equals(ma.name())) { + sorts.add(new EsQueryExec.ScoreSort(o.direction())); + } else { + assert false : "unexpected ordering on expression type " + o.child().getClass(); + } } return sorts; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/ReplaceSourceAttributes.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/ReplaceSourceAttributes.java index 74ea6f99e5e59..11e386ddd046c 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/ReplaceSourceAttributes.java +++ 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/ReplaceSourceAttributes.java @@ -16,6 +16,7 @@ import org.elasticsearch.xpack.esql.plan.physical.EsSourceExec; import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan; +import java.util.ArrayList; import java.util.List; import static org.elasticsearch.xpack.esql.optimizer.rules.logical.OptimizerRules.TransformDirection.UP; @@ -29,6 +30,8 @@ public ReplaceSourceAttributes() { @Override protected PhysicalPlan rule(EsSourceExec plan) { var docId = new FieldAttribute(plan.source(), EsQueryExec.DOC_ID_FIELD.getName(), EsQueryExec.DOC_ID_FIELD); + final List attributes = new ArrayList<>(); + attributes.add(docId); if (plan.indexMode() == IndexMode.TIME_SERIES) { Attribute tsid = null, timestamp = null; for (Attribute attr : plan.output()) { @@ -42,9 +45,14 @@ protected PhysicalPlan rule(EsSourceExec plan) { if (tsid == null || timestamp == null) { throw new IllegalStateException("_tsid or @timestamp are missing from the time-series source"); } - return new EsQueryExec(plan.source(), plan.index(), plan.indexMode(), List.of(docId, tsid, timestamp), plan.query()); - } else { - return new EsQueryExec(plan.source(), plan.index(), plan.indexMode(), List.of(docId), plan.query()); + attributes.add(tsid); + attributes.add(timestamp); } + plan.output().forEach(attr -> { + if (attr instanceof MetadataAttribute ma && ma.name().equals(MetadataAttribute.SCORE)) { + attributes.add(ma); + } + }); + return new EsQueryExec(plan.source(), plan.index(), plan.indexMode(), attributes, plan.query()); } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/parser/LogicalPlanBuilder.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/parser/LogicalPlanBuilder.java index 99e03b3653f79..24398afa18010 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/parser/LogicalPlanBuilder.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/parser/LogicalPlanBuilder.java @@ -16,6 +16,7 @@ import org.elasticsearch.dissect.DissectParser; import org.elasticsearch.index.IndexMode; import org.elasticsearch.xpack.esql.VerificationException; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; import org.elasticsearch.xpack.esql.common.Failure; import org.elasticsearch.xpack.esql.core.expression.Alias; import org.elasticsearch.xpack.esql.core.expression.Attribute; @@ -276,7 +277,8 @@ public LogicalPlan visitFromCommand(EsqlBaseParser.FromCommandContext ctx) { for (var c : metadataOptionContext.UNQUOTED_SOURCE()) { String id = c.getText(); Source src = source(c); - if (MetadataAttribute.isSupported(id) == false) { + if (MetadataAttribute.isSupported(id) == false // TODO: drop check below once METADATA_SCORE is no longer snapshot-only + || (EsqlCapabilities.Cap.METADATA_SCORE.isEnabled() == false && MetadataAttribute.SCORE.equals(id))) { throw new ParsingException(src, "unsupported metadata field [" + id + "]"); } Attribute a = metadataMap.put(id, MetadataAttribute.create(src, id)); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/physical/EsQueryExec.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/physical/EsQueryExec.java index 82848fb2f1062..267b9e613abef 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/physical/EsQueryExec.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/physical/EsQueryExec.java @@ -15,6 +15,7 @@ import 
org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.GeoDistanceSortBuilder; +import org.elasticsearch.search.sort.ScoreSortBuilder; import org.elasticsearch.search.sort.SortBuilder; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.xpack.esql.core.expression.Attribute; @@ -94,6 +95,19 @@ public SortBuilder sortBuilder() { } } + public record ScoreSort(Order.OrderDirection direction) implements Sort { + @Override + public SortBuilder sortBuilder() { + return new ScoreSortBuilder(); + } + + @Override + public FieldAttribute field() { + // TODO: refactor this: not all Sorts are backed by FieldAttributes + return null; + } + } + public EsQueryExec(Source source, EsIndex index, IndexMode indexMode, List attributes, QueryBuilder query) { this(source, index, indexMode, attributes, query, null, null, null); } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java index ab0d68b152262..15f5b6579098d 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java @@ -51,6 +51,7 @@ import org.elasticsearch.xpack.esql.core.expression.Attribute; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; +import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.type.MultiTypeEsField; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction; @@ -165,7 +166,10 @@ public final PhysicalOperation sourcePhysicalOperation(EsQueryExec esQueryExec, assert esQueryExec.estimatedRowSize() != null : "estimated row size not initialized"; int rowEstimatedSize = esQueryExec.estimatedRowSize(); int limit = esQueryExec.limit() != null ? 
(Integer) esQueryExec.limit().fold() : NO_LIMIT; - if (sorts != null && sorts.isEmpty() == false) { + boolean scoring = esQueryExec.attrs() + .stream() + .anyMatch(a -> a instanceof MetadataAttribute && a.name().equals(MetadataAttribute.SCORE)); + if ((sorts != null && sorts.isEmpty() == false)) { List> sortBuilders = new ArrayList<>(sorts.size()); for (Sort sort : sorts) { sortBuilders.add(sort.sortBuilder()); @@ -177,7 +181,8 @@ public final PhysicalOperation sourcePhysicalOperation(EsQueryExec esQueryExec, context.queryPragmas().taskConcurrency(), context.pageSize(rowEstimatedSize), limit, - sortBuilders + sortBuilders, + scoring ); } else { if (esQueryExec.indexMode() == IndexMode.TIME_SERIES) { @@ -195,7 +200,8 @@ public final PhysicalOperation sourcePhysicalOperation(EsQueryExec esQueryExec, context.queryPragmas().dataPartitioning(), context.queryPragmas().taskConcurrency(), context.pageSize(rowEstimatedSize), - limit + limit, + scoring ); } } @@ -273,7 +279,7 @@ public IndexSearcher searcher() { @Override public Optional buildSort(List> sorts) throws IOException { - return SortBuilder.buildSort(sorts, ctx); + return SortBuilder.buildSort(sorts, ctx, false); } @Override diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 355073fcc873f..6074601535477 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -12,6 +12,7 @@ import org.elasticsearch.test.ESTestCase; import org.elasticsearch.xpack.esql.VerificationException; import org.elasticsearch.xpack.esql.action.EsqlCapabilities; +import org.elasticsearch.xpack.esql.core.expression.Attribute; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.type.EsField; import org.elasticsearch.xpack.esql.core.type.InvalidMappedField; @@ -21,6 +22,7 @@ import org.elasticsearch.xpack.esql.parser.EsqlParser; import org.elasticsearch.xpack.esql.parser.QueryParam; import org.elasticsearch.xpack.esql.parser.QueryParams; +import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; import java.util.ArrayList; import java.util.LinkedHashMap; @@ -1754,6 +1756,29 @@ public void testToDatePeriodToTimeDurationWithInvalidType() { ); } + public void testNonMetadataScore() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + assertEquals("1:12: `_score` is a reserved METADATA attribute", error("from foo | eval _score = 10")); + + assertEquals( + "1:48: `_score` is a reserved METADATA attribute", + error("from foo metadata _score | where qstr(\"bar\") | eval _score = _score + 1") + ); + } + + public void testScoreRenaming() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + assertEquals("1:33: `_score` is a reserved METADATA attribute", error("from foo METADATA _id, _score | rename _id as _score")); + + assertTrue(passes("from foo metadata _score | rename _score as foo").stream().anyMatch(a -> a.name().equals("foo"))); + } + + private List passes(String query) { + LogicalPlan logicalPlan = defaultAnalyzer.analyze(parser.createStatement(query)); + assertTrue(logicalPlan.resolved()); + return logicalPlan.output(); + } + public void testIntervalAsString() { // DateTrunc for (String interval : List.of("1 minu", "1 dy", "1.5 minutes", "0.5 
days", "minutes 1", "day 5")) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java index f3ba11457a715..1f131f79c3d0e 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java @@ -36,6 +36,7 @@ import org.elasticsearch.xpack.esql.EsqlTestUtils.TestConfigurableSearchStats; import org.elasticsearch.xpack.esql.EsqlTestUtils.TestConfigurableSearchStats.Config; import org.elasticsearch.xpack.esql.VerificationException; +import org.elasticsearch.xpack.esql.action.EsqlCapabilities; import org.elasticsearch.xpack.esql.analysis.Analyzer; import org.elasticsearch.xpack.esql.analysis.AnalyzerContext; import org.elasticsearch.xpack.esql.analysis.EnrichResolution; @@ -63,6 +64,7 @@ import org.elasticsearch.xpack.esql.expression.function.aggregate.SpatialAggregateFunction; import org.elasticsearch.xpack.esql.expression.function.aggregate.SpatialCentroid; import org.elasticsearch.xpack.esql.expression.function.aggregate.Sum; +import org.elasticsearch.xpack.esql.expression.function.fulltext.Match; import org.elasticsearch.xpack.esql.expression.function.scalar.math.Round; import org.elasticsearch.xpack.esql.expression.function.scalar.spatial.SpatialContains; import org.elasticsearch.xpack.esql.expression.function.scalar.spatial.SpatialDisjoint; @@ -6581,6 +6583,66 @@ public void testLookupThenTopN() { ); } + public void testScore() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var plan = physicalPlan(""" + from test metadata _score + | where match(first_name, "john") + | keep _score + """); + + ProjectExec outerProject = as(plan, ProjectExec.class); + LimitExec limitExec = as(outerProject.child(), LimitExec.class); + ExchangeExec exchange = as(limitExec.child(), ExchangeExec.class); + FragmentExec frag = as(exchange.child(), FragmentExec.class); + + LogicalPlan opt = logicalOptimizer.optimize(frag.fragment()); + Limit limit = as(opt, Limit.class); + Filter filter = as(limit.child(), Filter.class); + + Match match = as(filter.condition(), Match.class); + assertTrue(match.field() instanceof FieldAttribute); + assertEquals("first_name", ((FieldAttribute) match.field()).field().getName()); + + EsRelation esRelation = as(filter.child(), EsRelation.class); + assertTrue(esRelation.optimized()); + assertTrue(esRelation.resolved()); + assertTrue(esRelation.output().stream().anyMatch(a -> a.name().equals(MetadataAttribute.SCORE) && a instanceof MetadataAttribute)); + } + + public void testScoreTopN() { + assumeTrue("'METADATA _score' is disabled", EsqlCapabilities.Cap.METADATA_SCORE.isEnabled()); + var plan = physicalPlan(""" + from test metadata _score + | where match(first_name, "john") + | keep _score + | sort _score desc + """); + + ProjectExec projectExec = as(plan, ProjectExec.class); + TopNExec topNExec = as(projectExec.child(), TopNExec.class); + ExchangeExec exchange = as(topNExec.child(), ExchangeExec.class); + FragmentExec frag = as(exchange.child(), FragmentExec.class); + + LogicalPlan opt = logicalOptimizer.optimize(frag.fragment()); + TopN topN = as(opt, TopN.class); + List<Order> order = topN.order(); + Order scoreOrder = order.getFirst(); + assertEquals(Order.OrderDirection.DESC, scoreOrder.direction()); + Expression child = scoreOrder.child(); + assertTrue(child instanceof MetadataAttribute ma && ma.name().equals(MetadataAttribute.SCORE)); + Filter filter = as(topN.child(), Filter.class); + + Match match = as(filter.condition(), Match.class); + assertTrue(match.field() instanceof FieldAttribute); + assertEquals("first_name", ((FieldAttribute) match.field()).field().getName()); + + EsRelation esRelation = as(filter.child(), EsRelation.class); + assertTrue(esRelation.optimized()); + assertTrue(esRelation.resolved()); + assertTrue(esRelation.output().stream().anyMatch(a -> a.name().equals(MetadataAttribute.SCORE) && a instanceof MetadataAttribute)); + } + @SuppressWarnings("SameParameterValue") private static void assertFilterCondition( Filter filter, diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSourceTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSourceTests.java index 98f0af8e4b8e6..2429bcb1a1b04 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSourceTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushTopNToSourceTests.java @@ -20,6 +20,7 @@ import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute; import org.elasticsearch.xpack.esql.core.expression.Nullability; import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; import org.elasticsearch.xpack.esql.core.tree.Source; @@ -64,6 +65,13 @@ public void testSimpleSortField() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleScoreSortField() { + // FROM index METADATA _score | SORT _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false).scoreSort().limit(10); + assertPushdownSort(query); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortMultipleFields() { // FROM index | SORT field, integer, double | LIMIT 10 var query = from("index").sort("field").sort("integer").sort("double").limit(10); @@ -71,6 +79,13 @@ public void testSimpleSortMultipleFields() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortMultipleFieldsAndScore() { + // FROM index | SORT field, integer, double, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false).sort("field").sort("integer").sort("double").scoreSort().limit(10); + assertPushdownSort(query); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortFieldAndEvalLiteral() { // FROM index | EVAL x = 1 | SORT field | LIMIT 10 var query = from("index").eval("x", e -> e.i(1)).sort("field").limit(10); @@ -78,6 +93,13 @@ public void testSimpleSortFieldAndEvalLiteral() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortFieldScoreAndEvalLiteral() { + // FROM index METADATA _score | EVAL x = 1 | SORT field, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false).eval("x", e -> e.i(1)).sort("field").scoreSort().limit(10); + assertPushdownSort(query, List.of(EvalExec.class, EsQueryExec.class)); + 
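// Note: each variant in this suite also asserts, via the call below, that the same query over a time-series index does not get the sort pushed down. +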
assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortFieldWithAlias() { // FROM index | EVAL x = field | SORT field | LIMIT 10 var query = from("index").eval("x", b -> b.field("field")).sort("field").limit(10); @@ -98,6 +120,21 @@ public void testSimpleSortMultipleFieldsWithAliases() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortMultipleFieldsWithAliasesAndScore() { + // FROM index | EVAL x = field, y = integer, z = double | SORT field, integer, double, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("x", b -> b.field("field")) + .eval("y", b -> b.field("integer")) + .eval("z", b -> b.field("double")) + .sort("field") + .sort("integer") + .sort("double") + .scoreSort() + .limit(10); + assertPushdownSort(query, List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortFieldAsAlias() { // FROM index | EVAL x = field | SORT x | LIMIT 10 var query = from("index").eval("x", b -> b.field("field")).sort("x").limit(10); @@ -105,6 +142,13 @@ public void testSimpleSortFieldAsAlias() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortFieldAsAliasAndScore() { + // FROM index METADATA _score | EVAL x = field | SORT x, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false).eval("x", b -> b.field("field")).sort("x").scoreSort().limit(10); + assertPushdownSort(query, Map.of("x", "field"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortFieldAndEvalSumLiterals() { // FROM index | EVAL sum = 1 + 2 | SORT field | LIMIT 10 var query = from("index").eval("sum", b -> b.add(b.i(1), b.i(2))).sort("field").limit(10); @@ -112,6 +156,17 @@ public void testSimpleSortFieldAndEvalSumLiterals() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortFieldAndEvalSumLiteralsAndScore() { + // FROM index METADATA _score | EVAL sum = 1 + 2 | SORT field, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("sum", b -> b.add(b.i(1), b.i(2))) + .sort("field") + .scoreSort() + .limit(10); + assertPushdownSort(query, List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortFieldAndEvalSumLiteralAndField() { // FROM index | EVAL sum = 1 + integer | SORT integer | LIMIT 10 var query = from("index").eval("sum", b -> b.add(b.i(1), b.field("integer"))).sort("integer").limit(10); @@ -119,6 +174,17 @@ public void testSimpleSortFieldAndEvalSumLiteralAndField() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSimpleSortFieldAndEvalSumLiteralAndFieldAndScore() { + // FROM index METADATA _score | EVAL sum = 1 + integer | SORT integer, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("sum", b -> b.add(b.i(1), b.field("integer"))) + .sort("integer") + .scoreSort() + .limit(10); + assertPushdownSort(query, List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSimpleSortEvalSumLiteralAndField() { // FROM index | EVAL sum = 1 + integer | SORT sum | LIMIT 10 var query = 
from("index").eval("sum", b -> b.add(b.i(1), b.field("integer"))).sort("sum").limit(10); @@ -144,6 +210,14 @@ public void testSortGeoPointField() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoPointFieldAndScore() { + // FROM index METADATA _score | SORT location, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false).sort("location", Order.OrderDirection.ASC).scoreSort().limit(10); + // NOTE: while geo_point is not sortable, this is checked during logical planning and the physical planner does not know or care + assertPushdownSort(query); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunction() { // FROM index | EVAL distance = ST_DISTANCE(location, POINT(1 2)) | SORT distance | LIMIT 10 var query = from("index").eval("distance", b -> b.distance("location", "POINT(1 2)")) @@ -154,6 +228,18 @@ public void testSortGeoDistanceFunction() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionAndScore() { + // FROM index METADATA _score | EVAL distance = ST_DISTANCE(location, POINT(1 2)) | SORT distance, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("distance", b -> b.distance("location", "POINT(1 2)")) + .sort("distance", Order.OrderDirection.ASC) + .scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertPushdownSort(query, Map.of("distance", "location"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunctionInverted() { // FROM index | EVAL distance = ST_DISTANCE(POINT(1 2), location) | SORT distance | LIMIT 10 var query = from("index").eval("distance", b -> b.distance("POINT(1 2)", "location")) @@ -164,6 +250,18 @@ public void testSortGeoDistanceFunctionInverted() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionInvertedAndScore() { + // FROM index METADATA _score | EVAL distance = ST_DISTANCE(POINT(1 2), location) | SORT distance, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("distance", b -> b.distance("POINT(1 2)", "location")) + .sort("distance", Order.OrderDirection.ASC) + .scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertPushdownSort(query, Map.of("distance", "location"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunctionLiterals() { // FROM index | EVAL distance = ST_DISTANCE(POINT(2 1), POINT(1 2)) | SORT distance | LIMIT 10 var query = from("index").eval("distance", b -> b.distance("POINT(2 1)", "POINT(1 2)")) @@ -174,6 +272,18 @@ public void testSortGeoDistanceFunctionLiterals() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionLiteralsAndScore() { + // FROM index METADATA _score | EVAL distance = ST_DISTANCE(POINT(2 1), POINT(1 2)) | SORT distance, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("distance", b -> b.distance("POINT(2 1)", "POINT(1 2)")) + .sort("distance", Order.OrderDirection.ASC) + 
.scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertNoPushdownSort(query, "sort on foldable distance function"); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunctionAndFieldsWithAliases() { // FROM index | EVAL distance = ST_DISTANCE(location, POINT(1 2)), x = field | SORT distance, field, integer | LIMIT 10 var query = from("index").eval("distance", b -> b.distance("location", "POINT(1 2)")) @@ -187,6 +297,21 @@ public void testSortGeoDistanceFunctionAndFieldsWithAliases() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionAndFieldsWithAliasesAndScore() { + // FROM index | EVAL distance = ST_DISTANCE(location, POINT(1 2)), x = field | SORT distance, field, integer, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("distance", b -> b.distance("location", "POINT(1 2)")) + .eval("x", b -> b.field("field")) + .sort("distance", Order.OrderDirection.ASC) + .sort("field", Order.OrderDirection.DESC) + .sort("integer", Order.OrderDirection.DESC) + .scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertPushdownSort(query, query.orders, Map.of("distance", "location"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunctionAndFieldsAndAliases() { // FROM index | EVAL distance = ST_DISTANCE(location, POINT(1 2)), x = field | SORT distance, x, integer | LIMIT 10 var query = from("index").eval("distance", b -> b.distance("location", "POINT(1 2)")) @@ -200,6 +325,21 @@ public void testSortGeoDistanceFunctionAndFieldsAndAliases() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionAndFieldsAndAliasesAndScore() { + // FROM index | EVAL distance = ST_DISTANCE(location, POINT(1 2)), x = field | SORT distance, x, integer, _score | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("distance", b -> b.distance("location", "POINT(1 2)")) + .eval("x", b -> b.field("field")) + .sort("distance", Order.OrderDirection.ASC) + .sort("x", Order.OrderDirection.DESC) + .sort("integer", Order.OrderDirection.DESC) + .scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertPushdownSort(query, query.orders, Map.of("distance", "location", "x", "field"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + public void testSortGeoDistanceFunctionAndFieldsAndManyAliases() { // FROM index // | EVAL loc = location, loc2 = loc, loc3 = loc2, distance = ST_DISTANCE(loc3, POINT(1 2)), x = field @@ -219,6 +359,27 @@ public void testSortGeoDistanceFunctionAndFieldsAndManyAliases() { assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); } + public void testSortGeoDistanceFunctionAndFieldsAndManyAliasesAndScore() { + // FROM index METADATA _score + // | EVAL loc = location, loc2 = loc, loc3 = loc2, distance = ST_DISTANCE(loc3, POINT(1 2)), x = field + // | SORT distance, x, integer, _score + // | LIMIT 10 + var query = from("index").metadata("_score", DOUBLE, false) + .eval("loc", b -> b.field("location")) + 
.eval("loc2", b -> b.ref("loc")) + .eval("loc3", b -> b.ref("loc2")) + .eval("distance", b -> b.distance("loc3", "POINT(1 2)")) + .eval("x", b -> b.field("field")) + .sort("distance", Order.OrderDirection.ASC) + .sort("x", Order.OrderDirection.DESC) + .sort("integer", Order.OrderDirection.DESC) + .scoreSort() + .limit(10); + // The pushed-down sort will use the underlying field 'location', not the sorted reference field 'distance' + assertPushdownSort(query, Map.of("distance", "location", "x", "field"), List.of(EvalExec.class, EsQueryExec.class)); + assertNoPushdownSort(query.asTimeSeries(), "for time series index mode"); + } + private static void assertPushdownSort(TestPhysicalPlanBuilder builder) { assertPushdownSort(builder, null, List.of(EsQueryExec.class)); } @@ -289,9 +450,12 @@ private static void assertPushdownSort( assertThat("Expect sorts count to match", sorts.size(), is(expectedSorts.size())); for (int i = 0; i < expectedSorts.size(); i++) { String name = ((Attribute) expectedSorts.get(i).child()).name(); - String fieldName = sorts.get(i).field().fieldName(); - assertThat("Expect sort[" + i + "] name to match", fieldName, is(sortName(name, fieldMap))); - assertThat("Expect sort[" + i + "] direction to match", sorts.get(i).direction(), is(expectedSorts.get(i).direction())); + EsQueryExec.Sort sort = sorts.get(i); + if (sort.field() != null) { + String fieldName = sort.field().fieldName(); + assertThat("Expect sort[" + i + "] name to match", fieldName, is(sortName(name, fieldMap))); + } + assertThat("Expect sort[" + i + "] direction to match", sort.direction(), is(expectedSorts.get(i).direction())); } } @@ -317,6 +481,7 @@ static class TestPhysicalPlanBuilder { private final String index; private final LinkedHashMap fields; private final LinkedHashMap refs; + private final LinkedHashMap metadata; private IndexMode indexMode; private final List aliases = new ArrayList<>(); private final List orders = new ArrayList<>(); @@ -327,6 +492,7 @@ private TestPhysicalPlanBuilder(String index, IndexMode indexMode) { this.indexMode = indexMode; this.fields = new LinkedHashMap<>(); this.refs = new LinkedHashMap<>(); + this.metadata = new LinkedHashMap<>(); addSortableFieldAttributes(this.fields); } @@ -346,6 +512,11 @@ static TestPhysicalPlanBuilder from(String index) { return new TestPhysicalPlanBuilder(index, IndexMode.STANDARD); } + TestPhysicalPlanBuilder metadata(String metadataAttribute, DataType dataType, boolean searchable) { + metadata.put(metadataAttribute, new MetadataAttribute(Source.EMPTY, metadataAttribute, dataType, searchable)); + return this; + } + public TestPhysicalPlanBuilder eval(Alias... 
aliases) { if (orders.isEmpty() == false) { throw new IllegalArgumentException("Eval must be before sort"); @@ -376,6 +547,22 @@ public TestPhysicalPlanBuilder sort(String field) { return sort(field, Order.OrderDirection.ASC); } + public TestPhysicalPlanBuilder scoreSort(Order.OrderDirection direction) { + orders.add( + new Order( + Source.EMPTY, + MetadataAttribute.create(Source.EMPTY, MetadataAttribute.SCORE), + direction, + Order.NullsPosition.LAST + ) + ); + return this; + } + + public TestPhysicalPlanBuilder scoreSort() { + return scoreSort(Order.OrderDirection.DESC); + } + public TestPhysicalPlanBuilder sort(String field, Order.OrderDirection direction) { Attribute attr = refs.get(field); if (attr == null) { From 6b94a91633fc846fe02ac8cf3173d613af27bc01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20Cea=20Fontenla?= Date: Thu, 28 Nov 2024 16:07:07 +0100 Subject: [PATCH 10/15] ESQL: Add nulls support to Categorize (#117655) Handle nulls and empty strings (Which resolve to null) on Categorize grouping function. Also, implement `seenGroupIds()`, which would fail some queries with nulls otherwise. --- docs/changelog/117655.yaml | 5 + .../AbstractCategorizeBlockHash.java | 37 +++++- .../blockhash/CategorizeRawBlockHash.java | 12 +- .../CategorizedIntermediateBlockHash.java | 19 ++- .../blockhash/CategorizeBlockHashTests.java | 72 +++++++---- .../src/main/resources/categorize.csv-spec | 122 ++++++++++-------- .../xpack/esql/action/EsqlCapabilities.java | 5 +- .../xpack/esql/analysis/VerifierTests.java | 6 +- .../optimizer/LogicalPlanOptimizerTests.java | 4 +- .../categorization/TokenListCategorizer.java | 2 + 10 files changed, 186 insertions(+), 98 deletions(-) create mode 100644 docs/changelog/117655.yaml diff --git a/docs/changelog/117655.yaml b/docs/changelog/117655.yaml new file mode 100644 index 0000000000000..f2afd3570f104 --- /dev/null +++ b/docs/changelog/117655.yaml @@ -0,0 +1,5 @@ +pr: 117655 +summary: Add nulls support to Categorize +area: ES|QL +type: enhancement +issues: [] diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java index 22d3a10facb06..0e89d77820883 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/AbstractCategorizeBlockHash.java @@ -13,8 +13,10 @@ import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.BitArray; import org.elasticsearch.common.util.BytesRefHash; +import org.elasticsearch.compute.aggregation.SeenGroupIds; import org.elasticsearch.compute.data.Block; import org.elasticsearch.compute.data.BlockFactory; +import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.data.BytesRefVector; import org.elasticsearch.compute.data.IntBlock; import org.elasticsearch.compute.data.IntVector; @@ -31,11 +33,21 @@ * Base BlockHash implementation for {@code Categorize} grouping function. 
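+ * With this change, ordinal 0 ({@code NULL_ORD}) is reserved for {@code null} values, so category ids coming from the wrapped categorizer are shifted by +1 wherever they are exposed as group ids.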
*/ public abstract class AbstractCategorizeBlockHash extends BlockHash { + protected static final int NULL_ORD = 0; + // TODO: this should probably also take an emitBatchSize private final int channel; private final boolean outputPartial; protected final TokenListCategorizer.CloseableTokenListCategorizer categorizer; + /** + * Store whether we've seen any {@code null} values. + *
<p>
+ * Null gets the {@link #NULL_ORD} ord. + *
</p>
+ */ + protected boolean seenNull = false; + AbstractCategorizeBlockHash(BlockFactory blockFactory, int channel, boolean outputPartial) { super(blockFactory); this.channel = channel; @@ -58,12 +70,12 @@ public Block[] getKeys() { @Override public IntVector nonEmpty() { - return IntVector.range(0, categorizer.getCategoryCount(), blockFactory); + return IntVector.range(seenNull ? 0 : 1, categorizer.getCategoryCount() + 1, blockFactory); } @Override public BitArray seenGroupIds(BigArrays bigArrays) { - throw new UnsupportedOperationException(); + return new SeenGroupIds.Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays); } @Override @@ -76,24 +88,39 @@ public final ReleasableIterator lookup(Page page, ByteSizeValue target */ private Block buildIntermediateBlock() { if (categorizer.getCategoryCount() == 0) { - return blockFactory.newConstantNullBlock(0); + return blockFactory.newConstantNullBlock(seenNull ? 1 : 0); } try (BytesStreamOutput out = new BytesStreamOutput()) { // TODO be more careful here. + out.writeBoolean(seenNull); out.writeVInt(categorizer.getCategoryCount()); for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { category.writeTo(out); } // We're returning a block with N positions just because the Page must have all blocks with the same position count! - return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), categorizer.getCategoryCount()); + int positionCount = categorizer.getCategoryCount() + (seenNull ? 1 : 0); + return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), positionCount); } catch (IOException e) { throw new RuntimeException(e); } } private Block buildFinalBlock() { + BytesRefBuilder scratch = new BytesRefBuilder(); + + if (seenNull) { + try (BytesRefBlock.Builder result = blockFactory.newBytesRefBlockBuilder(categorizer.getCategoryCount())) { + result.appendNull(); + for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { + scratch.copyChars(category.getRegex()); + result.appendBytesRef(scratch.get()); + scratch.clear(); + } + return result.build(); + } + } + try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) { - BytesRefBuilder scratch = new BytesRefBuilder(); for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { scratch.copyChars(category.getRegex()); result.appendBytesRef(scratch.get()); diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java index bf633e0454384..0d0a2fef2f82b 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeRawBlockHash.java @@ -64,7 +64,7 @@ public void close() { /** * Similar implementation to an Evaluator. 
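+ * Each input value is mapped to a category ordinal; {@code null} inputs (and empty strings, which analyze to {@code null}) map to {@code NULL_ORD} and set {@code seenNull}.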
*/ - public static final class CategorizeEvaluator implements Releasable { + public final class CategorizeEvaluator implements Releasable { private final CategorizationAnalyzer analyzer; private final TokenListCategorizer.CloseableTokenListCategorizer categorizer; @@ -95,7 +95,8 @@ public IntBlock eval(int positionCount, BytesRefBlock vBlock) { BytesRef vScratch = new BytesRef(); for (int p = 0; p < positionCount; p++) { if (vBlock.isNull(p)) { - result.appendNull(); + seenNull = true; + result.appendInt(NULL_ORD); continue; } int first = vBlock.getFirstValueIndex(p); @@ -126,7 +127,12 @@ public IntVector eval(int positionCount, BytesRefVector vVector) { } private int process(BytesRef v) { - return categorizer.computeCategory(v.utf8ToString(), analyzer).getId(); + var category = categorizer.computeCategory(v.utf8ToString(), analyzer); + if (category == null) { + seenNull = true; + return NULL_ORD; + } + return category.getId() + 1; } @Override diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java index 1bca34a70e5fa..c774d3b26049d 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizedIntermediateBlockHash.java @@ -40,9 +40,19 @@ public void add(Page page, GroupingAggregatorFunction.AddInput addInput) { return; } BytesRefBlock categorizerState = page.getBlock(channel()); + if (categorizerState.areAllValuesNull()) { + seenNull = true; + try (var newIds = blockFactory.newConstantIntVector(NULL_ORD, 1)) { + addInput.add(0, newIds); + } + return; + } + Map idMap = readIntermediate(categorizerState.getBytesRef(0, new BytesRef())); try (IntBlock.Builder newIdsBuilder = blockFactory.newIntBlockBuilder(idMap.size())) { - for (int i = 0; i < idMap.size(); i++) { + int fromId = idMap.containsKey(0) ? 
0 : 1; + int toId = fromId + idMap.size(); + for (int i = fromId; i < toId; i++) { newIdsBuilder.appendInt(idMap.get(i)); } try (IntBlock newIds = newIdsBuilder.build()) { @@ -59,10 +69,15 @@ public void add(Page page, GroupingAggregatorFunction.AddInput addInput) { private Map readIntermediate(BytesRef bytes) { Map idMap = new HashMap<>(); try (StreamInput in = new BytesArray(bytes).streamInput()) { + if (in.readBoolean()) { + seenNull = true; + idMap.put(NULL_ORD, NULL_ORD); + } int count = in.readVInt(); for (int oldCategoryId = 0; oldCategoryId < count; oldCategoryId++) { int newCategoryId = categorizer.mergeWireCategory(new SerializableTokenListCategory(in)).getId(); - idMap.put(oldCategoryId, newCategoryId); + // +1 because the 0 ordinal is reserved for null + idMap.put(oldCategoryId + 1, newCategoryId + 1); } return idMap; } catch (IOException e) { diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java index de8a2a44266fe..dd7a87dc4a574 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java @@ -52,7 +52,8 @@ public class CategorizeBlockHashTests extends BlockHashTestCase { public void testCategorizeRaw() { final Page page; - final int positions = 7; + boolean withNull = randomBoolean(); + final int positions = 7 + (withNull ? 1 : 0); try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(positions)) { builder.appendBytesRef(new BytesRef("Connected to 10.1.0.1")); builder.appendBytesRef(new BytesRef("Connection error")); @@ -61,6 +62,13 @@ public void testCategorizeRaw() { builder.appendBytesRef(new BytesRef("Disconnected")); builder.appendBytesRef(new BytesRef("Connected to 10.1.0.2")); builder.appendBytesRef(new BytesRef("Connected to 10.1.0.3")); + if (withNull) { + if (randomBoolean()) { + builder.appendNull(); + } else { + builder.appendBytesRef(new BytesRef("")); + } + } page = new Page(builder.build()); } @@ -70,13 +78,16 @@ public void testCategorizeRaw() { public void add(int positionOffset, IntBlock groupIds) { assertEquals(groupIds.getPositionCount(), positions); - assertEquals(0, groupIds.getInt(0)); - assertEquals(1, groupIds.getInt(1)); - assertEquals(1, groupIds.getInt(2)); - assertEquals(1, groupIds.getInt(3)); - assertEquals(2, groupIds.getInt(4)); - assertEquals(0, groupIds.getInt(5)); - assertEquals(0, groupIds.getInt(6)); + assertEquals(1, groupIds.getInt(0)); + assertEquals(2, groupIds.getInt(1)); + assertEquals(2, groupIds.getInt(2)); + assertEquals(2, groupIds.getInt(3)); + assertEquals(3, groupIds.getInt(4)); + assertEquals(1, groupIds.getInt(5)); + assertEquals(1, groupIds.getInt(6)); + if (withNull) { + assertEquals(0, groupIds.getInt(7)); + } } @Override @@ -100,7 +111,8 @@ public void close() { public void testCategorizeIntermediate() { Page page1; - int positions1 = 7; + boolean withNull = randomBoolean(); + int positions1 = 7 + (withNull ? 
1 : 0); try (BytesRefBlock.Builder builder = blockFactory.newBytesRefBlockBuilder(positions1)) { builder.appendBytesRef(new BytesRef("Connected to 10.1.0.1")); builder.appendBytesRef(new BytesRef("Connection error")); @@ -109,6 +121,13 @@ public void testCategorizeIntermediate() { builder.appendBytesRef(new BytesRef("Connection error")); builder.appendBytesRef(new BytesRef("Connected to 10.1.0.3")); builder.appendBytesRef(new BytesRef("Connected to 10.1.0.4")); + if (withNull) { + if (randomBoolean()) { + builder.appendNull(); + } else { + builder.appendBytesRef(new BytesRef("")); + } + } page1 = new Page(builder.build()); } Page page2; @@ -133,13 +152,16 @@ public void testCategorizeIntermediate() { @Override public void add(int positionOffset, IntBlock groupIds) { assertEquals(groupIds.getPositionCount(), positions1); - assertEquals(0, groupIds.getInt(0)); - assertEquals(1, groupIds.getInt(1)); - assertEquals(1, groupIds.getInt(2)); - assertEquals(0, groupIds.getInt(3)); - assertEquals(1, groupIds.getInt(4)); - assertEquals(0, groupIds.getInt(5)); - assertEquals(0, groupIds.getInt(6)); + assertEquals(1, groupIds.getInt(0)); + assertEquals(2, groupIds.getInt(1)); + assertEquals(2, groupIds.getInt(2)); + assertEquals(1, groupIds.getInt(3)); + assertEquals(2, groupIds.getInt(4)); + assertEquals(1, groupIds.getInt(5)); + assertEquals(1, groupIds.getInt(6)); + if (withNull) { + assertEquals(0, groupIds.getInt(7)); + } } @Override @@ -158,11 +180,11 @@ public void close() { @Override public void add(int positionOffset, IntBlock groupIds) { assertEquals(groupIds.getPositionCount(), positions2); - assertEquals(0, groupIds.getInt(0)); - assertEquals(1, groupIds.getInt(1)); - assertEquals(0, groupIds.getInt(2)); - assertEquals(1, groupIds.getInt(3)); - assertEquals(2, groupIds.getInt(4)); + assertEquals(1, groupIds.getInt(0)); + assertEquals(2, groupIds.getInt(1)); + assertEquals(1, groupIds.getInt(2)); + assertEquals(2, groupIds.getInt(3)); + assertEquals(3, groupIds.getInt(4)); } @Override @@ -189,7 +211,11 @@ public void add(int positionOffset, IntBlock groupIds) { .map(groupIds::getInt) .boxed() .collect(Collectors.toSet()); - assertEquals(values, Set.of(0, 1)); + if (withNull) { + assertEquals(Set.of(0, 1, 2), values); + } else { + assertEquals(Set.of(1, 2), values); + } } @Override @@ -212,7 +238,7 @@ public void add(int positionOffset, IntBlock groupIds) { .collect(Collectors.toSet()); // The category IDs {0, 1, 2} should map to groups {0, 2, 3}, because // 0 matches an existing category (Connected to ...), and the others are new. 
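// With ordinal 0 now reserved for null, those groups shift by one to {1, 3, 4}, which is what the updated assertion below expects.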
- assertEquals(values, Set.of(0, 2, 3)); + assertEquals(Set.of(1, 3, 4), values); } @Override diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec index 89d9026423204..547c430ed7518 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec @@ -1,5 +1,5 @@ standard aggs -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS count=COUNT(), @@ -17,7 +17,7 @@ count:long | sum:long | avg:double | count_distinct:long | category:keyw ; values aggs -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS values=MV_SORT(VALUES(message)), @@ -33,7 +33,7 @@ values:keyword | top ; mv -required_capability: categorize_v2 +required_capability: categorize_v3 FROM mv_sample_data | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(message) @@ -48,7 +48,7 @@ COUNT():long | SUM(event_duration):long | category:keyword ; row mv -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = ["connected to a", "connected to b", "disconnected"], str = ["a", "b", "c"] | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message) @@ -61,7 +61,7 @@ COUNT():long | VALUES(str):keyword | category:keyword ; with multiple indices -required_capability: categorize_v2 +required_capability: categorize_v3 required_capability: union_types FROM sample_data* @@ -76,7 +76,7 @@ COUNT():long | category:keyword ; mv with many values -required_capability: categorize_v2 +required_capability: categorize_v3 FROM employees | STATS COUNT() BY category=CATEGORIZE(job_positions) @@ -92,24 +92,37 @@ COUNT():long | category:keyword 10 | .*?Head.+?Human.+?Resources.*? ; -# Throws when calling AbstractCategorizeBlockHash.seenGroupIds() - Requires nulls support? -mv with many values-Ignore -required_capability: categorize_v2 +mv with many values and SUM +required_capability: categorize_v3 FROM employees | STATS SUM(languages) BY category=CATEGORIZE(job_positions) - | SORT category DESC + | SORT category | LIMIT 3 ; -SUM(languages):integer | category:keyword - 43 | .*?Accountant.*? - 46 | .*?Architect.*? - 35 | .*?Business.+?Analyst.*? +SUM(languages):long | category:keyword + 43 | .*?Accountant.*? + 46 | .*?Architect.*? + 35 | .*?Business.+?Analyst.*? +; + +mv with many values and nulls and SUM +required_capability: categorize_v3 + +FROM employees + | STATS SUM(languages) BY category=CATEGORIZE(job_positions) + | SORT category DESC + | LIMIT 2 +; + +SUM(languages):long | category:keyword + 27 | null + 46 | .*?Tech.+?Lead.*? 
; mv via eval -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | EVAL message = MV_APPEND(message, "Banana") @@ -125,7 +138,7 @@ COUNT():long | category:keyword ; mv via eval const -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | EVAL message = ["Banana", "Bread"] @@ -139,7 +152,7 @@ COUNT():long | category:keyword ; mv via eval const without aliases -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | EVAL message = ["Banana", "Bread"] @@ -153,7 +166,7 @@ COUNT():long | CATEGORIZE(message):keyword ; mv const in parameter -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) @@ -166,7 +179,7 @@ COUNT():long | c:keyword ; agg alias shadowing -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS c = COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) @@ -181,7 +194,7 @@ c:keyword ; chained aggregations using categorize -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -196,7 +209,7 @@ COUNT():long | category:keyword ; stats without aggs -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS BY category=CATEGORIZE(message) @@ -210,7 +223,7 @@ category:keyword ; text field -required_capability: categorize_v2 +required_capability: categorize_v3 FROM hosts | STATS COUNT() BY category=CATEGORIZE(host_group) @@ -221,10 +234,11 @@ COUNT():long | category:keyword 2 | .*?DB.+?servers.*? 2 | .*?Gateway.+?instances.*? 5 | .*?Kubernetes.+?cluster.*? + 1 | null ; on TO_UPPER -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(TO_UPPER(message)) @@ -238,7 +252,7 @@ COUNT():long | category:keyword ; on CONCAT -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " banana")) @@ -252,7 +266,7 @@ COUNT():long | category:keyword ; on CONCAT with unicode -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " 👍🏽😊")) @@ -266,7 +280,7 @@ COUNT():long | category:keyword ; on REVERSE(CONCAT()) -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(REVERSE(CONCAT(message, " 👍🏽😊"))) @@ -280,7 +294,7 @@ COUNT():long | category:keyword ; and then TO_LOWER -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -294,9 +308,8 @@ COUNT():long | category:keyword 1 | .*?disconnected.*? ; -# Throws NPE - Requires nulls support -on const empty string-Ignore -required_capability: categorize_v2 +on const empty string +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE("") @@ -304,12 +317,11 @@ FROM sample_data ; COUNT():long | category:keyword - 7 | .*?.*? + 7 | null ; -# Throws NPE - Requires nulls support -on const empty string from eval-Ignore -required_capability: categorize_v2 +on const empty string from eval +required_capability: categorize_v3 FROM sample_data | EVAL x = "" @@ -318,26 +330,24 @@ FROM sample_data ; COUNT():long | category:keyword - 7 | .*?.*? 
+ 7 | null ; -# Doesn't give the correct results - Requires nulls support -on null-Ignore -required_capability: categorize_v2 +on null +required_capability: categorize_v3 FROM sample_data | EVAL x = null - | STATS COUNT() BY category=CATEGORIZE(x) + | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(x) | SORT category ; -COUNT():long | category:keyword - 7 | null +COUNT():long | SUM(event_duration):long | category:keyword + 7 | 23231327 | null ; -# Doesn't give the correct results - Requires nulls support -on null string-Ignore -required_capability: categorize_v2 +on null string +required_capability: categorize_v3 FROM sample_data | EVAL x = null::string @@ -350,7 +360,7 @@ COUNT():long | category:keyword ; filtering out all data -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | WHERE @timestamp < "2023-10-23T00:00:00Z" @@ -362,7 +372,7 @@ COUNT():long | category:keyword ; filtering out all data with constant -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -373,7 +383,7 @@ COUNT():long | category:keyword ; drop output columns -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS count=COUNT() BY category=CATEGORIZE(message) @@ -388,7 +398,7 @@ x:integer ; category value processing -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = ["connected to a", "connected to b", "disconnected"] | STATS COUNT() BY category=CATEGORIZE(message) @@ -402,7 +412,7 @@ COUNT():long | category:keyword ; row aliases -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = "connected to a" | EVAL x = message @@ -416,7 +426,7 @@ COUNT():long | category:keyword | y:keyword ; from aliases -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | EVAL x = message @@ -432,7 +442,7 @@ COUNT():long | category:keyword | y:keyword ; row aliases with keep -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = "connected to a" | EVAL x = message @@ -448,7 +458,7 @@ COUNT():long | y:keyword ; from aliases with keep -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | EVAL x = message @@ -466,7 +476,7 @@ COUNT():long | y:keyword ; row rename -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = "connected to a" | RENAME message as x @@ -480,7 +490,7 @@ COUNT():long | y:keyword ; from rename -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | RENAME message as x @@ -496,7 +506,7 @@ COUNT():long | y:keyword ; row drop -required_capability: categorize_v2 +required_capability: categorize_v3 ROW message = "connected to a" | STATS c = COUNT() BY category=CATEGORIZE(message) @@ -509,7 +519,7 @@ c:long ; from drop -required_capability: categorize_v2 +required_capability: categorize_v3 FROM sample_data | STATS c = COUNT() BY category=CATEGORIZE(message) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 9bd4211855699..77a3e2840977f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -402,11 +402,8 @@ public enum Cap { /** 
* Supported the text categorization function "CATEGORIZE".
- * <p>
- * This capability was initially named `CATEGORIZE`, and got renamed after the function started correctly returning keywords.
- * </p>
*/ - CATEGORIZE_V2(Build.current().isSnapshot()), + CATEGORIZE_V3(Build.current().isSnapshot()), /** * QSTR function diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 6074601535477..dd14e8dd82123 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -1846,7 +1846,7 @@ public void testIntervalAsString() { } public void testCategorizeSingleGrouping() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V3.isEnabled()); query("from test | STATS COUNT(*) BY CATEGORIZE(first_name)"); query("from test | STATS COUNT(*) BY cat = CATEGORIZE(first_name)"); @@ -1875,7 +1875,7 @@ public void testCategorizeSingleGrouping() { } public void testCategorizeNestedGrouping() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V3.isEnabled()); query("from test | STATS COUNT(*) BY CATEGORIZE(LENGTH(first_name)::string)"); @@ -1890,7 +1890,7 @@ public void testCategorizeNestedGrouping() { } public void testCategorizeWithinAggregations() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V3.isEnabled()); query("from test | STATS MV_COUNT(cat), COUNT(*) BY cat = CATEGORIZE(first_name)"); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java index 8373528531902..e98f2b88b33c9 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java @@ -1212,7 +1212,7 @@ public void testCombineProjectionWithAggregationFirstAndAliasedGroupingUsedInAgg * \_EsRelation[test][_meta_field{f}#23, emp_no{f}#17, first_name{f}#18, ..] */ public void testCombineProjectionWithCategorizeGrouping() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V3.isEnabled()); var plan = plan(""" from test @@ -3949,7 +3949,7 @@ public void testNestedExpressionsInGroups() { * \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..] 
*/ public void testNestedExpressionsInGroupsWithCategorize() { - assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V2.isEnabled()); + assumeTrue("requires Categorize capability", EsqlCapabilities.Cap.CATEGORIZE_V3.isEnabled()); var plan = optimizedPlan(""" from test diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java index e4257270ce641..7fef6cdafa372 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/TokenListCategorizer.java @@ -115,6 +115,7 @@ public TokenListCategorizer( cacheRamUsage(0); } + @Nullable public TokenListCategory computeCategory(String s, CategorizationAnalyzer analyzer) { try (TokenStream ts = analyzer.tokenStream("text", s)) { return computeCategory(ts, s.length(), 1); @@ -123,6 +124,7 @@ public TokenListCategory computeCategory(String s, CategorizationAnalyzer analyz } } + @Nullable public TokenListCategory computeCategory(TokenStream ts, int unfilteredStringLen, long numDocs) throws IOException { assert partOfSpeechDictionary != null : "This version of computeCategory should only be used when a part-of-speech dictionary is available"; From 3c70cd081d40c36a5ac375b009932a0ce5eff1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mariusz=20J=C3=B3zala?= <377355+jozala@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:20:05 +0100 Subject: [PATCH 11/15] Revert "[CI] Ignore error about missing UBI artifact (#117506)" (#117704) This reverts commit 219372efaaf46a3b496df2142d3091d3434e67ec. This ignore is no longer necessary since the change to release-manager has been applied. --- .buildkite/scripts/dra-workflow.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.buildkite/scripts/dra-workflow.sh b/.buildkite/scripts/dra-workflow.sh index bbfa81f51b286..f2dc40ca1927f 100755 --- a/.buildkite/scripts/dra-workflow.sh +++ b/.buildkite/scripts/dra-workflow.sh @@ -75,7 +75,6 @@ find "$WORKSPACE" -type d -path "*/build/distributions" -exec chmod a+w {} \; echo --- Running release-manager -set +e # Artifacts should be generated docker run --rm \ --name release-manager \ @@ -92,16 +91,4 @@ docker run --rm \ --version "$ES_VERSION" \ --artifact-set main \ --dependency "beats:https://artifacts-${WORKFLOW}.elastic.co/beats/${BEATS_BUILD_ID}/manifest-${ES_VERSION}${VERSION_SUFFIX}.json" \ - --dependency "ml-cpp:https://artifacts-${WORKFLOW}.elastic.co/ml-cpp/${ML_CPP_BUILD_ID}/manifest-${ES_VERSION}${VERSION_SUFFIX}.json" \ -2>&1 | tee release-manager.log -EXIT_CODE=$? 
-set -e - -# This failure is just generating a ton of noise right now, so let's just ignore it -# This should be removed once this issue has been fixed -if grep "elasticsearch-ubi-9.0.0-SNAPSHOT-docker-image.tar.gz" release-manager.log; then - echo "Ignoring error about missing ubi artifact" - exit 0 -fi - -exit "$EXIT_CODE" + --dependency "ml-cpp:https://artifacts-${WORKFLOW}.elastic.co/ml-cpp/${ML_CPP_BUILD_ID}/manifest-${ES_VERSION}${VERSION_SUFFIX}.json" From 54db9470207df11f07475a6e8d4837b29515a4d7 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Thu, 28 Nov 2024 07:33:35 -0800 Subject: [PATCH 12/15] Fix scaled_float test (#117662) --- .../index/mapper/extras/ScaledFloatFieldMapperTests.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapperTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapperTests.java index dc9bc96f107a0..83fe07170d6e7 100644 --- a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapperTests.java +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/ScaledFloatFieldMapperTests.java @@ -527,7 +527,13 @@ protected Number randomNumber() { public void testEncodeDecodeExactScalingFactor() { double v = randomValue(); - assertThat(encodeDecode(1 / v, v), equalTo(1 / v)); + double expected = 1 / v; + // We don't produce infinities while decoding. See #testDecodeHandlingInfinity(). + if (Double.isInfinite(expected)) { + var sign = expected == Double.POSITIVE_INFINITY ? 1 : -1; + expected = sign * Double.MAX_VALUE; + } + assertThat(encodeDecode(1 / v, v), equalTo(expected)); } /** From ab604ada78d779a18b82465d51829006540ce546 Mon Sep 17 00:00:00 2001 From: Liam Thompson <32779855+leemthompo@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:34:57 +0100 Subject: [PATCH 13/15] [DOCS] Update tutorial example (#117538) --- .../full-text-filtering-tutorial.asciidoc | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/reference/quickstart/full-text-filtering-tutorial.asciidoc b/docs/reference/quickstart/full-text-filtering-tutorial.asciidoc index fee4b797da724..a024305588cae 100644 --- a/docs/reference/quickstart/full-text-filtering-tutorial.asciidoc +++ b/docs/reference/quickstart/full-text-filtering-tutorial.asciidoc @@ -511,8 +511,9 @@ In this tutorial scenario it's useful for when users have complex requirements f Let's create a query that addresses the following user needs: -* Must be a vegetarian main course +* Must be a vegetarian recipe * Should contain "curry" or "spicy" in the title or description +* Should be a main course * Must not be a dessert * Must have a rating of at least 4.5 * Should prefer recipes published in the last month @@ -524,16 +525,7 @@ GET /cooking_blog/_search "query": { "bool": { "must": [ - { - "term": { - "category.keyword": "Main Course" - } - }, - { - "term": { - "tags": "vegetarian" - } - }, + { "term": { "tags": "vegetarian" } }, { "range": { "rating": { @@ -543,10 +535,18 @@ GET /cooking_blog/_search } ], "should": [ + { + "term": { + "category": "Main Course" + } + }, { "multi_match": { "query": "curry spicy", - "fields": ["title^2", "description"] + "fields": [ + "title^2", + "description" + ] } }, { @@ -590,12 +590,12 @@ GET /cooking_blog/_search "value": 1, "relation": "eq" }, - "max_score": 7.9835095, + "max_score": 7.444513, "hits": [ { "_index": 
"cooking_blog", "_id": "2", - "_score": 7.9835095, + "_score": 7.444513, "_source": { "title": "Spicy Thai Green Curry: A Vegetarian Adventure", <1> "description": "Dive into the flavors of Thailand with this vibrant green curry. Packed with vegetables and aromatic herbs, this dish is both healthy and satisfying. Don't worry about the heat - you can easily adjust the spice level to your liking.", <2> @@ -619,8 +619,8 @@ GET /cooking_blog/_search <1> The title contains "Spicy" and "Curry", matching our should condition. With the default <> behavior, this field contributes most to the relevance score. <2> While the description also contains matching terms, only the best matching field's score is used by default. <3> The recipe was published within the last month, satisfying our recency preference. -<4> The "Main Course" category matches our `must` condition. -<5> The "vegetarian" tag satisfies another `must` condition, while "curry" and "spicy" tags align with our `should` preferences. +<4> The "Main Course" category satisfies another `should` condition. +<5> The "vegetarian" tag satisfies a `must` condition, while "curry" and "spicy" tags align with our `should` preferences. <6> The rating of 4.6 meets our minimum rating requirement of 4.5. ============== From f096c317c06052dc26c00b72448eda4743ab5965 Mon Sep 17 00:00:00 2001 From: Dimitris Rempapis Date: Thu, 28 Nov 2024 19:38:37 +0200 Subject: [PATCH 14/15] fix/SearchStatesIt_failures (#117618) Investigate and unmute automatically muted tests --- docs/changelog/117618.yaml | 5 +++++ muted-tests.yml | 6 ------ 2 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/117618.yaml diff --git a/docs/changelog/117618.yaml b/docs/changelog/117618.yaml new file mode 100644 index 0000000000000..5de29e2fe768c --- /dev/null +++ b/docs/changelog/117618.yaml @@ -0,0 +1,5 @@ +pr: 117618 +summary: SearchStatesIt failures reported by CI +area: Search +type: bug +issues: [116617, 116618] diff --git a/muted-tests.yml b/muted-tests.yml index fdadc747289bb..d703cfaa1b9aa 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -156,12 +156,6 @@ tests: - class: org.elasticsearch.xpack.searchablesnapshots.SearchableSnapshotsCanMatchOnCoordinatorIntegTests method: testSearchableSnapshotShardsAreSkippedBySearchRequestWithoutQueryingAnyNodeWhenTheyAreOutsideOfTheQueryRange issue: https://github.com/elastic/elasticsearch/issues/116523 -- class: org.elasticsearch.upgrades.SearchStatesIT - method: testBWCSearchStates - issue: https://github.com/elastic/elasticsearch/issues/116617 -- class: org.elasticsearch.upgrades.SearchStatesIT - method: testCanMatch - issue: https://github.com/elastic/elasticsearch/issues/116618 - class: org.elasticsearch.reservedstate.service.RepositoriesFileSettingsIT method: testSettingsApplied issue: https://github.com/elastic/elasticsearch/issues/116694 From 8350ff29ba18c7d03d652b107532415705426da9 Mon Sep 17 00:00:00 2001 From: John Verwolf Date: Thu, 28 Nov 2024 13:25:02 -0800 Subject: [PATCH 15/15] Extensible Completion Postings Formats (#111494) Allows the Completion Postings Format to be extensible by providing an implementation of the CompletionsPostingsFormatExtension SPIs. 
--- docs/changelog/111494.yaml | 5 ++++ server/src/main/java/module-info.java | 6 +++- .../index/codec/PerFieldFormatSupplier.java | 24 ++++++++++++++-- .../index/mapper/CompletionFieldMapper.java | 5 ---- .../index/mapper/MappingLookup.java | 17 ----------- .../CompletionsPostingsFormatExtension.java | 28 +++++++++++++++++++ 6 files changed, 59 insertions(+), 26 deletions(-) create mode 100644 docs/changelog/111494.yaml create mode 100644 server/src/main/java/org/elasticsearch/internal/CompletionsPostingsFormatExtension.java diff --git a/docs/changelog/111494.yaml b/docs/changelog/111494.yaml new file mode 100644 index 0000000000000..6c7b84bb04798 --- /dev/null +++ b/docs/changelog/111494.yaml @@ -0,0 +1,5 @@ +pr: 111494 +summary: Extensible Completion Postings Formats +area: "Suggesters" +type: enhancement +issues: [] diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 63dbac3a72487..d572d3b90fec8 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -7,6 +7,7 @@ * License v3.0 only", or the "Server Side Public License, v 1". */ +import org.elasticsearch.internal.CompletionsPostingsFormatExtension; import org.elasticsearch.plugins.internal.RestExtension; /** The Elasticsearch Server Module. */ @@ -288,7 +289,8 @@ to org.elasticsearch.serverless.version, org.elasticsearch.serverless.buildinfo, - org.elasticsearch.serverless.constants; + org.elasticsearch.serverless.constants, + org.elasticsearch.serverless.codec; exports org.elasticsearch.lucene.analysis.miscellaneous; exports org.elasticsearch.lucene.grouping; exports org.elasticsearch.lucene.queries; @@ -395,6 +397,7 @@ org.elasticsearch.stateless, org.elasticsearch.settings.secure, org.elasticsearch.serverless.constants, + org.elasticsearch.serverless.codec, org.elasticsearch.serverless.apifiltering, org.elasticsearch.internal.security; @@ -414,6 +417,7 @@ uses org.elasticsearch.node.internal.TerminationHandlerProvider; uses org.elasticsearch.internal.VersionExtension; uses org.elasticsearch.internal.BuildExtension; + uses CompletionsPostingsFormatExtension; uses org.elasticsearch.features.FeatureSpecification; uses org.elasticsearch.plugins.internal.LoggingDataProvider; diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 9c2a08a69002c..4d3d37ab4f3af 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -20,10 +20,15 @@ import org.elasticsearch.index.codec.bloomfilter.ES87BloomFilterPostingsFormat; import org.elasticsearch.index.codec.postings.ES812PostingsFormat; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat; +import org.elasticsearch.index.mapper.CompletionFieldMapper; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; +import org.elasticsearch.internal.CompletionsPostingsFormatExtension; +import org.elasticsearch.plugins.ExtensionLoader; + +import java.util.ServiceLoader; /** * Class that encapsulates the logic of figuring out the most appropriate file format for a given field, across postings, doc values and @@ -53,15 +58,28 @@ public PostingsFormat getPostingsFormatForField(String field) { private 
PostingsFormat internalGetPostingsFormatForField(String field) { if (mapperService != null) { - final PostingsFormat format = mapperService.mappingLookup().getPostingsFormat(field); - if (format != null) { - return format; + Mapper mapper = mapperService.mappingLookup().getMapper(field); + if (mapper instanceof CompletionFieldMapper) { + return PostingsFormatHolder.POSTINGS_FORMAT; } } // return our own posting format using PFOR return es812PostingsFormat; } + private static class PostingsFormatHolder { + private static final PostingsFormat POSTINGS_FORMAT = getPostingsFormat(); + + private static PostingsFormat getPostingsFormat() { + String defaultName = "Completion912"; // Caution: changing this name will result in exceptions if a field is created during a + // rolling upgrade and the new codec (specified by the name) is not available on all nodes in the cluster. + String codecName = ExtensionLoader.loadSingleton(ServiceLoader.load(CompletionsPostingsFormatExtension.class)) + .map(CompletionsPostingsFormatExtension::getFormatName) + .orElse(defaultName); + return PostingsFormat.forName(codecName); + } + } + boolean useBloomFilter(String field) { if (mapperService == null) { return false; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java index 53ccccdbd4bab..bb229c795a83e 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java @@ -8,7 +8,6 @@ */ package org.elasticsearch.index.mapper; -import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; @@ -344,10 +343,6 @@ public CompletionFieldType fieldType() { return (CompletionFieldType) super.fieldType(); } - static PostingsFormat postingsFormat() { - return PostingsFormat.forName("Completion912"); - } - @Override public boolean parsesArrayValue() { return true; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java index 2f78e11761448..ce3f8cfb53184 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java @@ -9,7 +9,6 @@ package org.elasticsearch.index.mapper; -import org.apache.lucene.codecs.PostingsFormat; import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.cluster.metadata.InferenceFieldMetadata; import org.elasticsearch.index.IndexSettings; @@ -21,7 +20,6 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -58,7 +56,6 @@ private CacheKey() {} private final Map indexAnalyzersMap; private final List indexTimeScriptMappers; private final Mapping mapping; - private final Set completionFields; private final int totalFieldsCount; /** @@ -161,7 +158,6 @@ private MappingLookup( this.nestedLookup = NestedLookup.build(nestedMappers); final Map indexAnalyzersMap = new HashMap<>(); - final Set completionFields = new HashSet<>(); final List indexTimeScriptMappers = new ArrayList<>(); for (FieldMapper mapper : mappers) { if (objects.containsKey(mapper.fullPath())) { @@ -174,9 +170,6 @@ private MappingLookup( if (mapper.hasScript()) { 
indexTimeScriptMappers.add(mapper); } - if (mapper instanceof CompletionFieldMapper) { - completionFields.add(mapper.fullPath()); - } } for (FieldAliasMapper aliasMapper : aliasMappers) { @@ -211,7 +204,6 @@ private MappingLookup( this.objectMappers = Map.copyOf(objects); this.runtimeFieldMappersCount = runtimeFields.size(); this.indexAnalyzersMap = Map.copyOf(indexAnalyzersMap); - this.completionFields = Set.copyOf(completionFields); this.indexTimeScriptMappers = List.copyOf(indexTimeScriptMappers); runtimeFields.stream().flatMap(RuntimeField::asMappedFieldTypes).map(MappedFieldType::name).forEach(this::validateDoesNotShadow); @@ -285,15 +277,6 @@ public Iterable fieldMappers() { return fieldMappers.values(); } - /** - * Gets the postings format for a particular field - * @param field the field to retrieve a postings format for - * @return the postings format for the field, or {@code null} if the default format should be used - */ - public PostingsFormat getPostingsFormat(String field) { - return completionFields.contains(field) ? CompletionFieldMapper.postingsFormat() : null; - } - void checkLimits(IndexSettings settings) { checkFieldLimit(settings.getMappingTotalFieldsLimit()); checkObjectDepthLimit(settings.getMappingDepthLimit()); diff --git a/server/src/main/java/org/elasticsearch/internal/CompletionsPostingsFormatExtension.java b/server/src/main/java/org/elasticsearch/internal/CompletionsPostingsFormatExtension.java new file mode 100644 index 0000000000000..bb28d4dd6c901 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/internal/CompletionsPostingsFormatExtension.java @@ -0,0 +1,28 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.internal; + +import org.apache.lucene.search.suggest.document.CompletionPostingsFormat; + +/** + * Allows plugging-in the Completions Postings Format. + */ +public interface CompletionsPostingsFormatExtension { + + /** + * Returns the name of the {@link CompletionPostingsFormat} that Elasticsearch should use. Should return null if the extension + * is not enabled. + *

+ * <p>
+ * Note that the name must match a codec that is available on all nodes in the cluster, otherwise IndexCorruptionExceptions will occur.
+ * A feature can be used to protect against this scenario, or alternatively, the codec code can be rolled out prior to its usage by this
+ * extension.
+ */
+ String getFormatName();
+}
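As an illustrative usage sketch (assumed names, not part of this change): a codec plugin could implement the SPI as follows and expose it to the JDK ServiceLoader via a `provides` clause in its own module-info.java, mirroring the `uses` clause added to the server module above. The returned name must resolve through PostingsFormat.forName(...) on every node in the cluster.

    // Hypothetical provider in a codec plugin; package and class names are illustrative.
    package org.example.completions;

    import org.elasticsearch.internal.CompletionsPostingsFormatExtension;

    public class ExampleCompletionsFormatExtension implements CompletionsPostingsFormatExtension {
        @Override
        public String getFormatName() {
            // Must name a completion postings format available on all nodes;
            // returning null falls back to the default ("Completion912") per
            // the Optional.map(...).orElse(...) lookup in PerFieldFormatSupplier.
            return "Completion912";
        }
    }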