[7.x] Implement stats aggregation for string terms (elastic#49097)

Backport of elastic#47468 to 7.x. This PR adds a new metric aggregation called string_stats that operates on string terms of a document and returns the following: min_length (the length of the shortest term), max_length (the length of the longest term), avg_length (the average length of all terms), distribution (the probability distribution of all characters appearing in all terms), and entropy (the total Shannon entropy value calculated for all terms). This aggregation has been implemented as an analytics plugin.
markharwood · Nov 15, 2019 · d9f0245 · d9f0245
1 parent 085d08c
commit d9f0245
Show file tree

Hide file tree

Showing 10 changed files with 1,153 additions and 9 deletions.
diff --git a/docs/build.gradle b/docs/build.gradle
@@ -177,7 +177,7 @@ buildRestTests.setups['ledger'] = '''
             {"index":{}}
             {"date": "2015/01/01 00:00:00", "amount": 200, "type": "sale", "description": "something"}
             {"index":{}}
-            {"date": "2015/01/01 00:00:00", "amount": 10, "type": "expense", "decription": "another thing"}
+            {"date": "2015/01/01 00:00:00", "amount": 10, "type": "expense", "description": "another thing"}
             {"index":{}}
             {"date": "2015/01/01 00:00:00", "amount": 150, "type": "sale", "description": "blah"}
             {"index":{}}

diff --git a/docs/reference/aggregations/metrics.asciidoc b/docs/reference/aggregations/metrics.asciidoc
@@ -35,6 +35,8 @@ include::metrics/scripted-metric-aggregation.asciidoc[]
 
 include::metrics/stats-aggregation.asciidoc[]
 
+include::metrics/string-stats-aggregation.asciidoc[]
+
 include::metrics/sum-aggregation.asciidoc[]
 
 include::metrics/tophits-aggregation.asciidoc[]

diff --git a/docs/reference/aggregations/metrics/string-stats-aggregation.asciidoc b/docs/reference/aggregations/metrics/string-stats-aggregation.asciidoc
@@ -0,0 +1,217 @@
+[role="xpack"]
+[testenv="basic"]
+[[search-aggregations-metrics-string-stats-aggregation]]
+=== String Stats Aggregation
+
+A `multi-value` metrics aggregation that computes statistics over string values extracted from the aggregated documents.
+These values can be retrieved either from specific `keyword` fields in the documents or can be generated by a provided script.
+
+The string stats aggregation returns the following results:
+
+* `count` - The number of non-empty fields counted.
+* `min_length` - The length of the shortest term.
+* `max_length` - The length of the longest term.
+* `avg_length` - The average length computed over all terms.
+* `entropy` - The https://en.wikipedia.org/wiki/Entropy_(information_theory)[Shannon Entropy] value computed over all terms collected by
+the aggregation. Shannon entropy quantifies the amount of information contained in the field. It is a very useful metric for
+measuring a wide range of properties of a data set, such as diversity, similarity, randomness etc.
+
+Assuming the data consists of Twitter messages:
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : { "string_stats" : { "field" : "message.keyword" } }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+
+The above aggregation computes the string statistics for the `message` field in all documents. The aggregation type
+is `string_stats` and the `field` parameter defines the field of the documents the stats will be computed on.
+The above will return the following:
+
+[source,console-result]
+--------------------------------------------------
+{
+    ...
+
+    "aggregations": {
+        "message_stats" : {
+            "count" : 5,
+            "min_length" : 24,
+            "max_length" : 30,
+            "avg_length" : 28.8,
+            "entropy" : 3.94617750050791
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+
+The name of the aggregation (`message_stats` above) also serves as the key by which the aggregation result can be retrieved from
+the returned response.
+
+==== Character distribution
+
+The computation of the Shannon Entropy value is based on the probability of each character appearing in all terms collected
+by the aggregation. To view the probability distribution for all characters, we can add the `show_distribution` (default: `false`) parameter.
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : {
+            "string_stats" : {
+                "field" : "message.keyword",
+                "show_distribution": true  <1>
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+
+<1> Set the `show_distribution` parameter to `true`, so that the probability distribution for all characters is returned in the results.
+
+[source,console-result]
+--------------------------------------------------
+{
+    ...
+
+    "aggregations": {
+        "message_stats" : {
+            "count" : 5,
+            "min_length" : 24,
+            "max_length" : 30,
+            "avg_length" : 28.8,
+            "entropy" : 3.94617750050791,
+            "distribution" : {
+                " " : 0.1527777777777778,
+                "e" : 0.14583333333333334,
+                "s" : 0.09722222222222222,
+                "m" : 0.08333333333333333,
+                "t" : 0.0763888888888889,
+                "h" : 0.0625,
+                "a" : 0.041666666666666664,
+                "i" : 0.041666666666666664,
+                "r" : 0.041666666666666664,
+                "g" : 0.034722222222222224,
+                "n" : 0.034722222222222224,
+                "o" : 0.034722222222222224,
+                "u" : 0.034722222222222224,
+                "b" : 0.027777777777777776,
+                "w" : 0.027777777777777776,
+                "c" : 0.013888888888888888,
+                "E" : 0.006944444444444444,
+                "l" : 0.006944444444444444,
+                "1" : 0.006944444444444444,
+                "2" : 0.006944444444444444,
+                "3" : 0.006944444444444444,
+                "4" : 0.006944444444444444,
+                "y" : 0.006944444444444444
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+
+The `distribution` object shows the probability of each character appearing in all terms. The characters are sorted by descending probability.
+
+==== Script
+
+Computing the message string stats based on a script:
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : {
+             "string_stats" : {
+                 "script" : {
+                     "lang": "painless",
+                     "source": "doc['message.keyword'].value"
+                 }
+             }
+         }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+
+This will interpret the `script` parameter as an `inline` script with the `painless` script language and no script parameters.
+To use a stored script, use the following syntax:
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : {
+            "string_stats" : {
+                "script" : {
+                    "id": "my_script",
+                    "params" : {
+                        "field" : "message.keyword"
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter,stored_example_script]
+
+===== Value Script
+
+We can use a value script to modify the message (e.g., we can add a prefix) and compute the new stats:
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : {
+            "string_stats" : {
+                "field" : "message.keyword",
+                "script" : {
+                    "lang": "painless",
+                    "source": "params.prefix + _value",
+                    "params" : {
+                        "prefix" : "Message: "
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+
+==== Missing value
+
+The `missing` parameter defines how documents that are missing a value should be treated.
+By default, they will be ignored, but it is also possible to treat them as if they had a value.
+
+[source,console]
+--------------------------------------------------
+POST /twitter/_search?size=0
+{
+    "aggs" : {
+        "message_stats" : {
+            "string_stats" : {
+                "field" : "message.keyword",
+                "missing": "[empty message]" <1>
+            }
+        }
+    }
+}
+--------------------------------------------------
+// TEST[setup:twitter]
+
+<1> Documents without a value in the `message` field will be treated as documents that have the value `[empty message]`.
diff --git a/...alytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsAggregationBuilders.java b/...alytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsAggregationBuilders.java
@@ -6,10 +6,15 @@
 package org.elasticsearch.xpack.analytics;
 
 import org.elasticsearch.xpack.analytics.cumulativecardinality.CumulativeCardinalityPipelineAggregationBuilder;
+import org.elasticsearch.xpack.analytics.stringstats.StringStatsAggregationBuilder;
 
 public class AnalyticsAggregationBuilders {
 
-    public static CumulativeCardinalityPipelineAggregationBuilder cumulativeCaardinality(String name, String bucketsPath) {
+    public static CumulativeCardinalityPipelineAggregationBuilder cumulativeCardinality(String name, String bucketsPath) {
         return new CumulativeCardinalityPipelineAggregationBuilder(name, bucketsPath);
     }
+
+    public static StringStatsAggregationBuilder stringStats(String name) {
+        return new StringStatsAggregationBuilder(name);
+    }
 }
diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java
@@ -13,11 +13,13 @@
 import org.elasticsearch.plugins.ActionPlugin;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.plugins.SearchPlugin;
-import org.elasticsearch.xpack.core.XPackPlugin;
-import org.elasticsearch.xpack.core.analytics.action.AnalyticsStatsAction;
 import org.elasticsearch.xpack.analytics.action.TransportAnalyticsStatsAction;
 import org.elasticsearch.xpack.analytics.cumulativecardinality.CumulativeCardinalityPipelineAggregationBuilder;
 import org.elasticsearch.xpack.analytics.cumulativecardinality.CumulativeCardinalityPipelineAggregator;
+import org.elasticsearch.xpack.analytics.stringstats.InternalStringStats;
+import org.elasticsearch.xpack.analytics.stringstats.StringStatsAggregationBuilder;
+import org.elasticsearch.xpack.core.XPackPlugin;
+import org.elasticsearch.xpack.core.analytics.action.AnalyticsStatsAction;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -40,11 +42,23 @@ public AnalyticsPlugin(Settings settings) {
 
     @Override
     public List<PipelineAggregationSpec> getPipelineAggregations() {
-        return singletonList(new PipelineAggregationSpec(
-            CumulativeCardinalityPipelineAggregationBuilder.NAME,
-            CumulativeCardinalityPipelineAggregationBuilder::new,
-            CumulativeCardinalityPipelineAggregator::new,
-            CumulativeCardinalityPipelineAggregationBuilder::parse));
+        return singletonList(
+            new PipelineAggregationSpec(
+                CumulativeCardinalityPipelineAggregationBuilder.NAME,
+                CumulativeCardinalityPipelineAggregationBuilder::new,
+                CumulativeCardinalityPipelineAggregator::new,
+                CumulativeCardinalityPipelineAggregationBuilder::parse)
+        );
+    }
+
+    @Override
+    public List<AggregationSpec> getAggregations() {
+        return singletonList(
+            new AggregationSpec(
+                StringStatsAggregationBuilder.NAME,
+                StringStatsAggregationBuilder::new,
+                StringStatsAggregationBuilder::parse).addResultReader(InternalStringStats::new)
+        );
     }
 
     @Override