[ML] adding new KS test pipeline aggregation #73334

Merged Jun 4, 2021 (16 commits)

Changes from 7 commits
4 changes: 4 additions & 0 deletions docs/reference/aggregations/pipeline.asciidoc
@@ -271,6 +271,10 @@ include::pipeline/avg-bucket-aggregation.asciidoc[]

include::pipeline/bucket-script-aggregation.asciidoc[]

include::pipeline/bucket-count-ks-test-aggregation.asciidoc[]

include::pipeline/bucket-correlation-aggregation.asciidoc[]

include::pipeline/bucket-selector-aggregation.asciidoc[]

include::pipeline/bucket-sort-aggregation.asciidoc[]
@@ -103,13 +103,13 @@ POST correlate_latency/_search?size=0&filter_path=aggregations
{
"aggs": {
"buckets": {
"terms": {
"terms": { <1>
"field": "version",
"size": 2
},
"aggs": {
"latency_ranges": {
"range": {
"range": { <2>
"field": "latency",
"ranges": [
{ "to": 0.0 },
@@ -126,7 +126,7 @@ POST correlate_latency/_search?size=0&filter_path=aggregations
]
}
},
"bucket_correlation": {
"bucket_correlation": { <3>
"bucket_correlation": {
"buckets_path": "latency_ranges>_count",
"function": {
@@ -0,0 +1,292 @@
[role="xpack"]
[testenv="basic"]
[[search-aggregations-bucket-count-ks-test-aggregation]]
=== Bucket count K-S test correlation aggregation
++++
<titleabbrev>Bucket count K-S test aggregation</titleabbrev>
++++

experimental::[]

A sibling pipeline aggregation which executes a two sample Kolmogorov–Smirnov test
(referred to as a "K-S test" from now on) against a provided distribution and
the distribution of documents counts in the configured sibling aggregation.
Contributor: I think it is worth expanding this somewhat. For example, I think it is worth making clear that this is computing the K-S p-value between Y and E[Y | c], where c is the "event" selected by the sibling aggregation. The natural use case is when the sibling aggregation is a terms agg, in which case it computes the K-S between Y and the restriction of Y to each term. This also means that one can't use any old values for `fractions`; they have to match the actual proportion of the docs in each range bucket for Y.

Contributor: I would propose something like the following:

A sibling pipeline aggregation which executes a two sample Kolmogorov–Smirnov test
(referred to as a "K-S test" from now on) against a provided distribution and the
distribution implied by the document counts in the configured sibling aggregation.
Specifically, for some metric, assuming that the percentile intervals of the metric are
known beforehand or have been computed by an aggregation, then one would use a range
aggregation for the sibling to compute the p-value of the distribution difference between
the metric and the restriction of that metric to a subset of the documents. A natural use
case is if the sibling aggregation is a range aggregation nested in a terms aggregation, in
which case one compares the overall distribution of the metric to its restriction to each term.

Contributor: (I know you have some discussion of this later on, but I think this is crucial to understanding how this is meant to work for someone who knows what the K-S test actually does, so it is worth including upfront.)


This test is useful to determine if two samples (represented by `fractions` and `buckets_path`) are
drawn from the same distribution.
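As a rough sketch of the mechanics (illustrative code only, not the actual Elasticsearch implementation), both samples are reduced to CDFs over the same bucket boundaries — one from the sibling aggregation's doc counts, one from `fractions` — and the K-S statistic is the largest difference between the two CDFs:

```java
import java.util.Arrays;

// Illustrative sketch of the two-sample K-S statistic on bucketed data.
// Hypothetical helper code, not the Elasticsearch implementation.
public class KsSketch {

    // Empirical CDF implied by non-negative bucket weights (counts or fractions).
    static double[] cdf(double[] weights) {
        double total = Arrays.stream(weights).sum();
        double[] cdf = new double[weights.length];
        double running = 0.0;
        for (int i = 0; i < weights.length; i++) {
            running += weights[i];
            cdf[i] = running / total;
        }
        return cdf;
    }

    // "two_sided" K-S statistic: largest absolute CDF difference at any boundary.
    static double twoSided(double[] counts, double[] fractions) {
        double[] f1 = cdf(counts);
        double[] f2 = cdf(fractions);
        double max = 0.0;
        for (int i = 0; i < f1.length; i++) {
            max = Math.max(max, Math.abs(f1[i] - f2[i]));
        }
        return max;
    }

    public static void main(String[] args) {
        // Bucket counts for one term, vs a uniform distribution over 11 buckets.
        double[] counts = {0, 1, 9, 0, 0, 0, 10, 20, 20, 20, 20};
        double[] uniform = new double[counts.length];
        Arrays.fill(uniform, 1.0 / counts.length);
        System.out.printf("two_sided statistic = %.4f%n", twoSided(counts, uniform));
    }
}
```

Under the default uniform `fractions`, any skew of the sibling counts toward particular buckets drives this statistic up, which the aggregation then converts into per-alternative p-values like those in the example response below.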

[[bucket-count-ks-test-agg-syntax]]
==== Parameters

`buckets_path`::
(Required, string)
Path to the buckets that contain one set of values to correlate. Must be a `_count` path.
For syntax, see <<buckets-path-syntax>>.

`alternative`::
(Optional, list)
A list of string values indicating which K-S test alternative to calculate.
The valid values are: "greater", "less", "two_sided". This parameter is key for
determining the K-S statistic used when calculating the K-S test. Default value is
all possible alternative hypothesis.
Contributor: typo

Suggested change:
- all possible alternative hypothesis.
+ all possible alternative hypotheses.


`fractions`::
(Optional, list)
A list of doubles indicating the distribution of the samples with which to compare to the
`buckets_path` results. The default is a uniform distribution of the same length as the
`buckets_path` buckets.
Contributor: We have rather specific requirements on what `fractions` mean. To produce a meaningful result from this aggregation they should be related to some metric distribution which is then used to create the sibling aggregation. A natural choice is to use equal percentile range queries to construct the sibling aggregation, in which case the default is correct. I think it is worth capturing something along these lines.

Contributor: I'd propose something like:

A list of doubles indicating the distribution of the samples with which to compare to the
`buckets_path` results. In typical usage this is the overall proportion of documents in
each bucket, which is compared with the actual document proportions in each bucket
from the sibling aggregation counts. The default is to assume that overall documents
are uniformly distributed on these buckets, which they would be if one used equal
percentiles of a metric to define the bucket end points.


`sampling_method`::
(Optional, string)
Indicates the sampling methodology when calculating the K-S test. Note, this is sampling
of the returned values. This determines the cumulative distribution function (cdf) points
Member:

Suggested change:
- of the returned values. This determines the cumulative distribution function (cdf) points
+ of the returned values. This determines the cumulative distribution function (CDF) points

Capitalised CDF is used in the next sentence.

used comparing the two samples. Default is `upper_tail`, which emphasizes the upper
end of the CDF points. Valid options are: `upper_tail`, `uniform`, and `lower_tail`.

==== Syntax

A `bucket_count_ks_test` aggregation looks like this in isolation:

[source,js]
--------------------------------------------------
{
"bucket_count_ks_test": {
"buckets_path": "range_values>_count", <1>
"alternative": ["less", "greater", "two_sided"], <2>
"sampling_method": "upper_tail" <3>
}
}
--------------------------------------------------
// NOTCONSOLE
<1> The buckets containing the values to test against.
davidkyle marked this conversation as resolved.
<2> The alternatives to calculate.
<3> The sampling method for the K-S statistic.


[[bucket-count-ks-test-agg-example]]
==== Example

The following snippet runs the `bucket_count_ks_test` on the individual terms in the field `version` against a uniform distribution.
The uniform distribution reflects the `latency` percentile buckets. Not shown is the pre-calculation of the `latency` indicator values,
which was done utilizing the
<<search-aggregations-metrics-percentile-aggregation,percentiles>> aggregation.

This example is only using the 10s percentiles.
Contributor:

Suggested change:
- This example is only using the 10s percentiles.
+ This example is only using the deciles of `latency`.
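Not shown in this PR is how the range end points were found; a request along the following lines (a sketch, with field and aggregation names assumed from the example index) would compute the deciles of `latency` that the `ranges` below are built from:

[source,console]
--------------------------------------------------
POST correlate_latency/_search?size=0&filter_path=aggregations
{
  "aggs": {
    "latency_percentiles": {
      "percentiles": {
        "field": "latency",
        "percents": [ 10, 20, 30, 40, 50, 60, 70, 80, 90 ]
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE

The returned percentile values become the `from`/`to` boundaries of the range buckets, so that by construction the overall documents are uniformly distributed across them.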


[source,console]
-------------------------------------------------
POST correlate_latency/_search?size=0&filter_path=aggregations
{
"aggs": {
"buckets": {
"terms": { <1>
"field": "version",
"size": 2
},
"aggs": {
"latency_ranges": {
"range": { <2>
"field": "latency",
"ranges": [
{ "to": 0 },
{ "from": 0, "to": 105 },
{ "from": 105, "to": 225 },
{ "from": 225, "to": 445 },
{ "from": 445, "to": 665 },
{ "from": 665, "to": 885 },
{ "from": 885, "to": 1115 },
{ "from": 1115, "to": 1335 },
{ "from": 1335, "to": 1555 },
{ "from": 1555, "to": 1775 },
{ "from": 1775 }
]
}
},
"ks_test": { <3>
"bucket_count_ks_test": {
"buckets_path": "latency_ranges>_count",
"alternative": ["less", "greater", "two_sided"]
}
}
}
}
}
}
-------------------------------------------------
// TEST[setup:correlate_latency]

<1> The term buckets containing a range aggregation and the bucket count K-S test aggregation. Both are utilized to compare
the distribution of `latency` for each term with the overall latency distribution.
<2> The range aggregation on the latency field. The ranges were created referencing the percentiles of the latency field.
Member: It would be nice to have a full example somewhere in the docs showing how the ranges were found.

Member Author: Possibly. I think though that is a larger "how correlations + K-S tests work" type of docs.

Member: Actually this isn't so bad, as you explain that the latency ranges were calculated using a percentiles agg.

<3> The bucket count K-S test aggregation that tests if the bucket counts come from the same distribution as `fractions`,
where `fractions` is a uniform distribution.

And the following may be the response:

[source,console-result]
----
{
"aggregations" : {
"buckets" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.0",
"doc_count" : 100,
"latency_ranges" : {
"buckets" : [
{
"key" : "*-0.0",
"to" : 0.0,
"doc_count" : 0
},
{
"key" : "0.0-105.0",
"from" : 0.0,
"to" : 105.0,
"doc_count" : 1
},
{
"key" : "105.0-225.0",
"from" : 105.0,
"to" : 225.0,
"doc_count" : 9
},
{
"key" : "225.0-445.0",
"from" : 225.0,
"to" : 445.0,
"doc_count" : 0
},
{
"key" : "445.0-665.0",
"from" : 445.0,
"to" : 665.0,
"doc_count" : 0
},
{
"key" : "665.0-885.0",
"from" : 665.0,
"to" : 885.0,
"doc_count" : 0
},
{
"key" : "885.0-1115.0",
"from" : 885.0,
"to" : 1115.0,
"doc_count" : 10
},
{
"key" : "1115.0-1335.0",
"from" : 1115.0,
"to" : 1335.0,
"doc_count" : 20
},
{
"key" : "1335.0-1555.0",
"from" : 1335.0,
"to" : 1555.0,
"doc_count" : 20
},
{
"key" : "1555.0-1775.0",
"from" : 1555.0,
"to" : 1775.0,
"doc_count" : 20
},
{
"key" : "1775.0-*",
"from" : 1775.0,
"doc_count" : 20
}
]
},
"ks_test" : {
"less" : 2.248673241788478E-4,
"greater" : 1.0,
"two_sided" : 2.248673241788478E-4
}
},
{
"key" : "2.0",
"doc_count" : 100,
"latency_ranges" : {
"buckets" : [
{
"key" : "*-0.0",
"to" : 0.0,
"doc_count" : 0
},
{
"key" : "0.0-105.0",
"from" : 0.0,
"to" : 105.0,
"doc_count" : 19
},
{
"key" : "105.0-225.0",
"from" : 105.0,
"to" : 225.0,
"doc_count" : 11
},
{
"key" : "225.0-445.0",
"from" : 225.0,
"to" : 445.0,
"doc_count" : 20
},
{
"key" : "445.0-665.0",
"from" : 445.0,
"to" : 665.0,
"doc_count" : 20
},
{
"key" : "665.0-885.0",
"from" : 665.0,
"to" : 885.0,
"doc_count" : 20
},
{
"key" : "885.0-1115.0",
"from" : 885.0,
"to" : 1115.0,
"doc_count" : 10
},
{
"key" : "1115.0-1335.0",
"from" : 1115.0,
"to" : 1335.0,
"doc_count" : 0
},
{
"key" : "1335.0-1555.0",
"from" : 1335.0,
"to" : 1555.0,
"doc_count" : 0
},
{
"key" : "1555.0-1775.0",
"from" : 1555.0,
"to" : 1775.0,
"doc_count" : 0
},
{
"key" : "1775.0-*",
"from" : 1775.0,
"doc_count" : 0
}
]
},
"ks_test" : {
"less" : 0.9642895789647244,
"greater" : 4.58718174664754E-9,
"two_sided" : 4.58718174664754E-9
Contributor: I would expect greater to be very nearly two_sided / 2 in this case.

Member Author: Yeah, the two_sided isn't accurate. I think Hodges' approximation only works in the one-sided case. We will probably have to do something else.

}
}
]
}
}
}
----
@@ -237,6 +237,7 @@
import org.elasticsearch.xpack.ml.action.TransportValidateJobConfigAction;
import org.elasticsearch.xpack.ml.aggs.correlation.BucketCorrelationAggregationBuilder;
import org.elasticsearch.xpack.ml.aggs.correlation.CorrelationNamedContentProvider;
import org.elasticsearch.xpack.ml.aggs.kstest.BucketCountKSTestAggregationBuilder;
import org.elasticsearch.xpack.ml.annotations.AnnotationPersister;
import org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingDeciderService;
import org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingNamedWritableProvider;
@@ -1087,7 +1088,8 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
public List<PipelineAggregationSpec> getPipelineAggregations() {
return Arrays.asList(
InferencePipelineAggregationBuilder.buildSpec(modelLoadingService, getLicenseState()),
BucketCorrelationAggregationBuilder.buildSpec()
BucketCorrelationAggregationBuilder.buildSpec(),
BucketCountKSTestAggregationBuilder.buildSpec()
);
}

@@ -0,0 +1,43 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.ml.aggs;

import org.elasticsearch.common.Numbers;

public final class DoubleArray {
Contributor: Could you add a unit test for this class?


private DoubleArray() { }

/**
* Returns a NEW {@link double[]} that is the cumulative sum of the passed array
* @param xs array to cumulatively sum
* @return new double[]
*/
public static double[] cumulativeSum(double[] xs) {
Contributor: Could you add a method comment? Especially a sentence that states that xs is immutable.

double[] sum = new double[xs.length];
sum[0] = xs[0];
for (int i = 1; i < xs.length; i++) {
sum[i] = sum[i - 1] + xs[i];
}
return sum;
}

/**
* Mutably divides each element in `xs` by the value `v`
* @param xs array to mutate with the divisor
* @param v The divisor, must not be 0.0, Infinite, or NaN.
*/
public static void divMut(double[] xs, double v) {
Contributor: Could you add a method comment? Especially a sentence that points out that xs is mutable.

if (v == 0.0 || Numbers.isValidDouble(v) == false) {
throw new IllegalArgumentException("unable to divide by [" + v + "] as it results in undefined behavior");
}
for (int i = 0; i < xs.length; i++) {
xs[i] /= v;
}
}
}
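A reviewer asked for a unit test of this class. A standalone sketch of what such a test could assert (the helpers are re-implemented inline here for illustration, with `Double.isFinite` standing in for `Numbers.isValidDouble`):

```java
import java.util.Arrays;

// Standalone sketch mirroring DoubleArray.cumulativeSum and divMut,
// plus the behavior a unit test would assert. Not part of the PR.
public class DoubleArraySketch {

    // Returns a NEW array holding the running sums; the input is not modified.
    static double[] cumulativeSum(double[] xs) {
        double[] sum = new double[xs.length];
        sum[0] = xs[0];
        for (int i = 1; i < xs.length; i++) {
            sum[i] = sum[i - 1] + xs[i];
        }
        return sum;
    }

    // Divides each element of xs IN PLACE by v; rejects 0, NaN, and infinities.
    static void divMut(double[] xs, double v) {
        if (v == 0.0 || Double.isFinite(v) == false) {
            throw new IllegalArgumentException("unable to divide by [" + v + "]");
        }
        for (int i = 0; i < xs.length; i++) {
            xs[i] /= v;
        }
    }

    public static void main(String[] args) {
        double[] xs = {1.0, 2.0, 3.0};
        double[] sums = cumulativeSum(xs);
        System.out.println(Arrays.toString(sums));   // running sums of xs
        System.out.println(Arrays.toString(xs));     // xs itself is unchanged
        divMut(sums, sums[sums.length - 1]);         // normalize in place
        System.out.println(Arrays.toString(sums));   // last element is now 1.0
    }
}
```

One edge case worth a dedicated test: as written, `cumulativeSum(new double[0])` throws `ArrayIndexOutOfBoundsException` at `sum[0] = xs[0]`, which a unit test would either pin down or the method would guard against.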
Loading