
[ML] adding new KS test pipeline aggregation #73334

Merged: 16 commits, Jun 4, 2021
Changes from 6 commits
4 changes: 4 additions & 0 deletions docs/reference/aggregations/pipeline.asciidoc
@@ -271,6 +271,10 @@ include::pipeline/avg-bucket-aggregation.asciidoc[]

include::pipeline/bucket-script-aggregation.asciidoc[]

include::pipeline/bucket-count-ks-test-aggregation.asciidoc[]

include::pipeline/bucket-correlation-aggregation.asciidoc[]

include::pipeline/bucket-selector-aggregation.asciidoc[]

include::pipeline/bucket-sort-aggregation.asciidoc[]
@@ -103,13 +103,13 @@ POST correlate_latency/_search?size=0&filter_path=aggregations
{
"aggs": {
"buckets": {
"terms": {
"terms": { <1>
"field": "version",
"size": 2
},
"aggs": {
"latency_ranges": {
"range": {
"range": { <2>
"field": "latency",
"ranges": [
{ "to": 0.0 },
@@ -126,7 +126,7 @@ POST correlate_latency/_search?size=0&filter_path=aggregations
]
}
},
"bucket_correlation": {
"bucket_correlation": { <3>
"bucket_correlation": {
"buckets_path": "latency_ranges>_count",
"function": {
@@ -0,0 +1,291 @@
[role="xpack"]
[testenv="basic"]
[[search-aggregations-bucket-count-ks-test-aggregation]]
=== Bucket count K-S test correlation aggregation
++++
<titleabbrev>Bucket count K-S test aggregation</titleabbrev>
++++

experimental::[]

A sibling pipeline aggregation which executes a two sample Kolmogorov–Smirnov test
(referred to as a "K-S test" from now own) against a provided distribution and
[Review comment (Contributor)] Suggested change:
- (referred to as a "K-S test" from now own) against a provided distribution and
+ (referred to as a "K-S test" from now on) against a provided distribution and

the distribution of document counts in the configured sibling aggregation.

This test is useful to determine if two samples (represented by `fractions` and `buckets_path`) are
drawn from the same distribution.
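For intuition, the two-sample K-S statistic underlying this test is the largest vertical distance between the empirical CDFs of the two samples. A minimal Python sketch (illustrative only, not the Elasticsearch implementation):

```python
# Illustrative two-sample Kolmogorov-Smirnov statistic: the maximum absolute
# difference between the two empirical cumulative distribution functions.
def ks_statistic(sample_a, sample_b):
    """Return the two-sided two-sample K-S statistic D."""
    points = sorted(set(sample_a) | set(sample_b))
    n_a, n_b = len(sample_a), len(sample_b)
    d = 0.0
    for x in points:
        cdf_a = sum(v <= x for v in sample_a) / n_a
        cdf_b = sum(v <= x for v in sample_b) / n_b
        d = max(d, abs(cdf_a - cdf_b))
    return d

# Identical samples give D = 0; fully separated samples give D = 1.
print(ks_statistic([1, 2, 3], [1, 2, 3]))    # 0.0
print(ks_statistic([1, 2, 3], [10, 20, 30])) # 1.0
```

A small D is consistent with the two samples coming from the same distribution; the aggregation itself reports p-values derived from statistics of this kind.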

[[bucket-count-ks-test-agg-syntax]]
==== Parameters

`buckets_path`::
(Required, string)
Path to the buckets that contain one set of values to correlate. Must be a `_count` path.
For syntax, see <<buckets-path-syntax>>.

`alternative`::
(Required, list)
A list of string values indicating which K-S test alternative to calculate.
The valid values are: "greater", "less", "two_sided". This parameter is key for
determining the K-S statistic used when calculating the K-S test.

`fractions`::
(Optional, list)
A list of doubles indicating the distribution of the samples with which to compare to the
`buckets_path` results. The default is a uniform distribution of the same length as the
`buckets_path` buckets.
[Review comment (Contributor)] We have rather specific requirements on what fractions mean. To produce a meaningful result from this aggregation they should be related to some metric distribution which is then used to create the sibling aggregation. A natural choice is to use equal percentile range queries to construct the sibling aggregation, in which case the default is correct. I think it is worth capturing something along these lines.

[Review comment (Contributor)] I'd propose something like:

    A list of doubles indicating the distribution of the samples with which to compare to the
    `buckets_path` results. In typical usage this is the overall proportion of documents in
    each bucket, which is compared with the actual document proportions in each bucket
    from the sibling aggregation counts. The default is to assume that overall documents
    are uniformly distributed on these buckets, which they would be if one used equal
    percentiles of a metric to define the bucket end points.


`sampling_method`::
(Optional, string)
Indicates the sampling methodology when calculating the K-S test. Note, this is sampling
of the returned values. This determines the cumulative distribution function (cdf) points
[Review comment (Member)] Suggested change:
- of the returned values. This determines the cumulative distribution function (cdf) points
+ of the returned values. This determines the cumulative distribution function (CDF) points

Capitalised CDF is used in the next sentence.

used when comparing the two samples. Default is `upper_tail`, which emphasizes the upper
end of the CDF points. Valid options are: `upper_tail`, `uniform`, and `lower_tail`.

==== Syntax

A `bucket_count_ks_test` aggregation looks like this in isolation:

[source,js]
--------------------------------------------------
{
"bucket_count_ks_test": {
"buckets_path": "range_values>_count", <1>
"alternative": ["less", "greater", "two_sided"], <2>
"sampling_method": "upper_tail" <3>
}
}
--------------------------------------------------
// NOTCONSOLE
<1> The buckets containing the values to test against.
davidkyle marked this conversation as resolved.
<2> The alternatives to calculate.
<3> The sampling method for the K-S statistic.


[[bucket-count-ks-test-agg-example]]
==== Example

The following snippet runs the `bucket_count_ks_test` on the individual terms in the field `version` against a uniform distribution.
The uniform distribution reflects the `latency` percentile buckets. Not shown is the pre-calculation of the `latency` indicator values,
which was done utilizing the
<<search-aggregations-metrics-percentile-aggregation,percentiles>> aggregation.
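The elided pre-calculation can be approximated as follows (a sketch assuming a simple nearest-rank percentile definition, which may differ from the interpolation the percentiles aggregation uses):

```python
# Sketch of the assumed workflow: take the deciles of the observed latency
# values and use them as the range aggregation's bucket end points.
def deciles(values):
    """Return the 10th..90th percentile values via a nearest-rank rule."""
    s = sorted(values)
    n = len(s)
    return [s[min(n - 1, n * q // 10)] for q in range(1, 10)]

# For latencies 1..100 the deciles land near 10, 20, ..., 90.
print(deciles(list(range(1, 101))))
```

The resulting nine boundaries, plus open-ended buckets at each end, give the eleven ranges used in the example below.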

This example is only using the 10s percentiles.
[Review comment (Contributor)] Suggested change:
- This example is only using the 10s percentiles.
+ This example is only using the deciles of `latency`.


[source,console]
-------------------------------------------------
POST correlate_latency/_search?size=0&filter_path=aggregations
{
"aggs": {
"buckets": {
"terms": { <1>
"field": "version",
"size": 2
},
"aggs": {
"latency_ranges": {
"range": { <2>
"field": "latency",
"ranges": [
{ "to": 0.0 },
[Review comment (Contributor)] Could 0.0 be replaced with 0 so that it is an integer, like numbers in other ranges?

{ "from": 0, "to": 105 },
{ "from": 105, "to": 225 },
{ "from": 225, "to": 445 },
{ "from": 445, "to": 665 },
{ "from": 665, "to": 885 },
{ "from": 885, "to": 1115 },
{ "from": 1115, "to": 1335 },
{ "from": 1335, "to": 1555 },
{ "from": 1555, "to": 1775 },
{ "from": 1775 }
]
}
},
"ks_test": { <3>
"bucket_count_ks_test": {
"buckets_path": "latency_ranges>_count",
"alternative": ["less", "greater", "two_sided"]
}
}
}
}
}
}
-------------------------------------------------
// TEST[setup:correlate_latency]

<1> The term buckets containing a range aggregation and the bucket count K-S test aggregation. Both are utilized to calculate
the relationship of the term values with the latency.
<2> The range aggregation on the latency field. The ranges were created referencing the percentiles of the latency field.
[Review comment (Member)] It would be nice to have a full example somewhere in the docs showing how the ranges were found.

[Reply (Member Author)] Possibly, I think though that is a larger "how correlations + K-S tests work" type of docs.

[Reply (Member)] Actually this isn't so bad as you explain that the latency ranges were calculated using a percentiles agg.

<3> The bucket count K-S test aggregation that determines if the count samples are from the same distribution as the uniform
distribution.

[Review comment (Contributor)] Could this sentence be simplified to: "The bucket count K-S test aggregation that determines if the count samples come from the uniform distribution"?

[Reply (Member Author)] Technically, no. I will think of a better way of phrasing it. The two-sample K-S test (which we are doing here) verifies that two samples come from the same distribution. In this particular case, we are testing if one sample (a uniform fraction sample) and another (these _counts) both come from the same distribution.
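The idea behind the three alternatives can be sketched as follows (illustrative only: the sign conventions for `less` and `greater` are assumptions, and the actual aggregation returns p-values rather than raw D statistics):

```python
# Compare the cumulative fraction of a term's bucket counts against a
# reference CDF built from `fractions` (uniform by default), tracking a
# separate maximum deviation for each alternative.
def ks_alternative_statistics(counts, fractions=None):
    n = len(counts)
    fractions = fractions or [1.0 / n] * n
    total = sum(counts)
    cdf_sample = cdf_ref = 0.0
    stats = {"less": 0.0, "greater": 0.0, "two_sided": 0.0}
    for c, f in zip(counts, fractions):
        cdf_sample += c / total
        cdf_ref += f
        diff = cdf_sample - cdf_ref
        stats["greater"] = max(stats["greater"], diff)    # sample CDF above reference
        stats["less"] = max(stats["less"], -diff)         # sample CDF below reference
        stats["two_sided"] = max(stats["two_sided"], abs(diff))
    return stats

# Counts that exactly match a uniform distribution give D = 0 for every alternative.
print(ks_alternative_statistics([10, 10, 10, 10]))
```

Counts concentrated at one end of the buckets drive one of the one-sided statistics up while leaving the other near zero, which is the pattern visible in the p-values of the example response.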

And the following may be the response:

[source,console-result]
----
{
"aggregations" : {
"buckets" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.0",
"doc_count" : 100,
"latency_ranges" : {
"buckets" : [
{
"key" : "*-0.0",
"to" : 0.0,
"doc_count" : 0
},
{
"key" : "0.0-105.0",
"from" : 0.0,
"to" : 105.0,
"doc_count" : 1
},
{
"key" : "105.0-225.0",
"from" : 105.0,
"to" : 225.0,
"doc_count" : 9
},
{
"key" : "225.0-445.0",
"from" : 225.0,
"to" : 445.0,
"doc_count" : 0
},
{
"key" : "445.0-665.0",
"from" : 445.0,
"to" : 665.0,
"doc_count" : 0
},
{
"key" : "665.0-885.0",
"from" : 665.0,
"to" : 885.0,
"doc_count" : 0
},
{
"key" : "885.0-1115.0",
"from" : 885.0,
"to" : 1115.0,
"doc_count" : 10
},
{
"key" : "1115.0-1335.0",
"from" : 1115.0,
"to" : 1335.0,
"doc_count" : 20
},
{
"key" : "1335.0-1555.0",
"from" : 1335.0,
"to" : 1555.0,
"doc_count" : 20
},
{
"key" : "1555.0-1775.0",
"from" : 1555.0,
"to" : 1775.0,
"doc_count" : 20
},
{
"key" : "1775.0-*",
"from" : 1775.0,
"doc_count" : 20
}
]
},
"ks_test" : {
"less" : 2.248673241788478E-4,
"greater" : 1.0,
"two_sided" : 2.248673241788478E-4
}
},
{
"key" : "2.0",
"doc_count" : 100,
"latency_ranges" : {
"buckets" : [
{
"key" : "*-0.0",
"to" : 0.0,
"doc_count" : 0
},
{
"key" : "0.0-105.0",
"from" : 0.0,
"to" : 105.0,
"doc_count" : 19
},
{
"key" : "105.0-225.0",
"from" : 105.0,
"to" : 225.0,
"doc_count" : 11
},
{
"key" : "225.0-445.0",
"from" : 225.0,
"to" : 445.0,
"doc_count" : 20
},
{
"key" : "445.0-665.0",
"from" : 445.0,
"to" : 665.0,
"doc_count" : 20
},
{
"key" : "665.0-885.0",
"from" : 665.0,
"to" : 885.0,
"doc_count" : 20
},
{
"key" : "885.0-1115.0",
"from" : 885.0,
"to" : 1115.0,
"doc_count" : 10
},
{
"key" : "1115.0-1335.0",
"from" : 1115.0,
"to" : 1335.0,
"doc_count" : 0
},
{
"key" : "1335.0-1555.0",
"from" : 1335.0,
"to" : 1555.0,
"doc_count" : 0
},
{
"key" : "1555.0-1775.0",
"from" : 1555.0,
"to" : 1775.0,
"doc_count" : 0
},
{
"key" : "1775.0-*",
"from" : 1775.0,
"doc_count" : 0
}
]
},
"ks_test" : {
"less" : 0.9642895789647244,
"greater" : 4.58718174664754E-9,
"two_sided" : 4.58718174664754E-9
[Review comment (Contributor)] I would expect greater to be very nearly two_sided / 2 in this case.

[Reply (Member Author)] Yeah, the two_sided isn't accurate. I think Hodge's approximation only works in the sided case. We will probably have to do something else.

}
}
]
}
}
}
----
@@ -97,29 +97,33 @@ public GapPolicy gapPolicy() {
@Override
protected abstract PipelineAggregator createInternal(Map<String, Object> metadata);

@Override
protected void validate(ValidationContext context) {
if (bucketsPaths.length != 1) {
context.addBucketPathValidationError("must contain a single entry for aggregation [" + name + "]");
return;
}
protected void validateBucketPath(ValidationContext context, String bucketsPath) {
[Review comment (Contributor)] Could this method be private? I cannot see it being used outside of this class.

// Need to find the first agg name in the buckets path to check its a
// multi bucket agg: aggs are split with '>' and can optionally have a
// metric name after them by using '.' so need to split on both to get
// just the agg name
final String firstAgg = bucketsPaths[0].split("[>\\.]")[0];
final String firstAgg = bucketsPath.split("[>\\.]")[0];
Optional<AggregationBuilder> aggBuilder = context.getSiblingAggregations().stream()
.filter(builder -> builder.getName().equals(firstAgg))
.findAny();
.filter(builder -> builder.getName().equals(firstAgg))
.findAny();
if (aggBuilder.isEmpty()) {
context.addBucketPathValidationError("aggregation does not exist for aggregation [" + name + "]: " + bucketsPaths[0]);
context.addBucketPathValidationError("aggregation does not exist for aggregation [" + name + "]: " + bucketsPath);
return;
}
if (aggBuilder.get().bucketCardinality() != AggregationBuilder.BucketCardinality.MANY) {
context.addValidationError("The first aggregation in " + PipelineAggregator.Parser.BUCKETS_PATH.getPreferredName()
+ " must be a multi-bucket aggregation for aggregation [" + name + "] found :"
+ aggBuilder.get().getClass().getName() + " for buckets path: " + bucketsPaths[0]);
+ " must be a multi-bucket aggregation for aggregation [" + name + "] found :"
+ aggBuilder.get().getClass().getName() + " for buckets path: " + bucketsPath);
}
}

@Override
protected void validate(ValidationContext context) {
if (bucketsPaths.length != 1) {
context.addBucketPathValidationError("must contain a single entry for aggregation [" + name + "]");
return;
}
validateBucketPath(context, bucketsPaths[0]);
}

@Override
@@ -237,6 +237,7 @@
import org.elasticsearch.xpack.ml.action.TransportValidateJobConfigAction;
import org.elasticsearch.xpack.ml.aggs.correlation.BucketCorrelationAggregationBuilder;
import org.elasticsearch.xpack.ml.aggs.correlation.CorrelationNamedContentProvider;
import org.elasticsearch.xpack.ml.aggs.kstest.BucketCountKSTestAggregationBuilder;
import org.elasticsearch.xpack.ml.annotations.AnnotationPersister;
import org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingDeciderService;
import org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingNamedWritableProvider;
@@ -1087,7 +1088,8 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
public List<PipelineAggregationSpec> getPipelineAggregations() {
return Arrays.asList(
InferencePipelineAggregationBuilder.buildSpec(modelLoadingService, getLicenseState()),
BucketCorrelationAggregationBuilder.buildSpec()
BucketCorrelationAggregationBuilder.buildSpec(),
BucketCountKSTestAggregationBuilder.buildSpec()
);
}
