From e7bb4718ea907acb6c181492f9a38229b63cbc76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Istv=C3=A1n=20Zolt=C3=A1n=20Szab=C3=B3?= Date: Mon, 30 May 2022 10:36:05 +0200 Subject: [PATCH] [8.3][DOCS] Removes frequent_items agg docs. (#87150) --- docs/reference/aggregations/bucket.asciidoc | 2 - .../frequent-items-aggregation.asciidoc | 318 ------------------ 2 files changed, 320 deletions(-) delete mode 100644 docs/reference/aggregations/bucket/frequent-items-aggregation.asciidoc diff --git a/docs/reference/aggregations/bucket.asciidoc b/docs/reference/aggregations/bucket.asciidoc index a52eb15c9a151..8d3e02cf01be0 100644 --- a/docs/reference/aggregations/bucket.asciidoc +++ b/docs/reference/aggregations/bucket.asciidoc @@ -36,8 +36,6 @@ include::bucket/filter-aggregation.asciidoc[] include::bucket/filters-aggregation.asciidoc[] -include::bucket/frequent-items-aggregation.asciidoc[] - include::bucket/geodistance-aggregation.asciidoc[] include::bucket/geohashgrid-aggregation.asciidoc[] diff --git a/docs/reference/aggregations/bucket/frequent-items-aggregation.asciidoc b/docs/reference/aggregations/bucket/frequent-items-aggregation.asciidoc deleted file mode 100644 index 8eda65232bcde..0000000000000 --- a/docs/reference/aggregations/bucket/frequent-items-aggregation.asciidoc +++ /dev/null @@ -1,318 +0,0 @@ -[[search-aggregations-bucket-frequent-items-aggregation]] -=== Frequent items aggregation -++++ -Frequent items -++++ - -experimental::[] - -A bucket aggregation which finds frequent item sets. It -is a form of association rules mining that identifies items that often occur -together. It also helps you to discover relationships between different data -points (items). Items that are frequently purchased together or log events that -tend to co-occur are examples of frequent item sets. Finding frequent item sets -helps to discover relationships between different data points (items). - -The aggregation reports closed item sets. A frequent item set is called closed -if no superset exists with the same ratio of documents (also known as its -<>). For example, we have the two -following candidates for a frequent item set, which have the same support value: -1. `apple, orange, banana` -2. `apple, orange, banana, tomato`. -Only the second item set (`apple, orange, banana, tomato`) is returned, and the -first set – which is a subset of the second one – is skipped. Both item sets -might be returned if their support values are different. - - -==== Syntax - -A `frequent_items` aggregation looks like this in isolation: - -[source,js] --------------------------------------------------- -"frequent_items": { - "minimum_set_size": 3, - "fields": [ - {"field": "my_field_1"}, - {"field": "my_field_2"} - ] -} --------------------------------------------------- -// NOTCONSOLE - -.`frequent_items` Parameters -|=== -|Parameter Name |Description |Required |Default Value -|`fields` |(array) Fields to analyze. | Required | -|`minimum_set_size` | (integer) The <> of one item set. | Optional | `1` -|`minimum_support` | (integer) The <> of one item set. | Optional | `0.1` -|`size` | (integer) The number of top item sets to return. | Optional | `10` -|=== - - -[discrete] -[[frequent-items-fields]] -==== Fields - -Supported field types for the analyzed fields are keyword, numeric, ip, date, -and arrays of these types. You can also add runtime fields to your analyzed fields. - -If the combined cardinality of the analyzed fields are high, then the -aggregation might require a significant amount of system resources. - -[discrete] -[[frequent-items-minimum-set-size]] -==== Minimum set size - -The minimum set size is the minimum number of items the set needs to contain. A -value of 1 returns the frequency of single items. Only item sets that contain at -least the number of `minimum_set_size` items are returned. For example, the item -set `orange, banana, apple` is only returned if the minimum set size is 3 or -lower. - -[discrete] -[[frequent-items-minimum-support]] -==== Minimum support - -The minimum support value is the ratio of documents that an item set must exist -in to be considered "frequent". In particular, it is a normalized value between -0 and 1. It is calculated by dividing the number of documents containing the -item set by the total number of documents. - -For example, if a given item set is contained by five documents and the total -number of documents is 20, then the support of the item set is 5/20 = 0.25. -Therefore, this set is returned only if the minimum support is 0.25 or lower. -As a higher minimum support prunes more items, the calculation is less resource -intensive. The `minimum_support` parameter has an effect on the required memory -and the runtime of the aggregation. - - -[discrete] -[[frequent-items-size]] -==== Size - -This parameter defines the maximum number of item sets to return. The result -contains top-k item sets; the item sets with the highest support values. This -parameter has a significant effect on the required memory and the runtime of the -aggregation. - - -[discrete] -[[frequent-items-example]] -==== Examples - -In the following examples, we use the e-commerce {kib} sample data set. - - -[discrete] -==== Aggregation with two analized fields - -In the first example, the goal is to find out based on transaction data (1.) -from what product categories the customers purchase products frequently together -and (2.) from which cities they make those purchases. We are interested in sets -with three or more items, and want to see the first three frequent item sets -with the highest support. - -[source,console] -------------------------------------------------- -GET kibana_sample_data_ecommerce /_search -{ - "size": 0, - "aggs": { - "my_agg": { - "frequent_items": { - "minimum_set_size": 3, - "fields": [ - { "field": "category.keyword" }, - { "field": "geoip.city_name" } - ], - "size": 3 - } - } - } -} -------------------------------------------------- -// TEST[skip:setup kibana sample data] - -The API returns a response similar to the following one: - -[source,console-result] -------------------------------------------------- -(...) -"aggregations" : { - "my_agg" : { - "buckets" : [ - { - "key" : { - "category.keyword" : [ - "Women's Clothing", - "Women's Shoes" - ], - "geoip.city_name" : [ - "New York" - ] - }, - "doc_count" : 217, - "support" : 0.04641711229946524 - }, - { - "key" : { - "category.keyword" : [ - "Women's Clothing", - "Women's Accessories" - ], - "geoip.city_name" : [ - "New York" - ] - }, - "doc_count" : 135, - "support" : 0.028877005347593583 - }, - { - "key" : { - "category.keyword" : [ - "Men's Clothing", - "Men's Shoes" - ], - "geoip.city_name" : [ - "Cairo" - ] - }, - "doc_count" : 123, - "support" : 0.026310160427807486 - } - ], - (...) - } -} -------------------------------------------------- -// TEST[skip:setup kibana sample data] - -The response shows that the categories customers purchase from most frequently -together are `Women's Clothing` and `Women's Shoes` and customers from New York -tend to buy items from these categories frequently togeher. In other words, -customers who buy products labelled Women's Clothing more likely buy products -also from the Women's Shoes category and customers from New York most likely buy -products from these categories together. The item set with the second highest -support is `Women's Clothing` and `Women's Accessories` with customers mostly -from New York. Finally, the item set with the third highest support is -`Men's Clothing` and `Men's Shoes` with customers mostly from Cairo. - - -[discrete] -==== Analizing numeric values by using a runtime field - -The frequent items aggregation enables you to bucket numeric values by using -<>. The next example demonstrates how to use a script to -add a runtime field to your documents that called `price_range` which is -calculated from the taxful total price of the individual transactions. The -runtime field then can be used in the frequent items aggregation as a field to -analyze. - - -[source,console] -------------------------------------------------- -GET kibana_sample_data_ecommerce/_search -{ - "runtime_mappings": { - "price_range": { - "type": "keyword", - "script": { - "source": """ - def bucket_start = (long) Math.floor(doc['taxful_total_price'].value / 50) * 50; - def bucket_end = bucket_start + 50; - emit(bucket_start.toString() + "-" + bucket_end.toString()); - """ - } - } - }, - "size": 0, - "aggs": { - "my_agg": { - "frequent_items": { - "minimum_set_size": 4, - "fields": [ - { - "field": "category.keyword" - }, - { - "field": "price_range" - }, - { - "field": "geoip.city_name" - } - ], - "size": 3 - } - } - } -} -------------------------------------------------- -// TEST[skip:setup kibana sample data] - -The API returns a response similar to the following one: - -[source,console-result] -------------------------------------------------- -(...) -"aggregations" : { - "my_agg" : { - "buckets" : [ - { - "key" : { - "category.keyword" : [ - "Women's Clothing", - "Women's Shoes" - ], - "price_range" : [ - "50-100" - ], - "geoip.city_name" : [ - "New York" - ] - }, - "doc_count" : 100, - "support" : 0.0213903743315508 - }, - { - "key" : { - "category.keyword" : [ - "Women's Clothing", - "Women's Shoes" - ], - "price_range" : [ - "50-100" - ], - "geoip.city_name" : [ - "Dubai" - ] - }, - "doc_count" : 59, - "support" : 0.012620320855614974 - }, - { - "key" : { - "category.keyword" : [ - "Men's Clothing", - "Men's Shoes" - ], - "price_range" : [ - "50-100" - ], - "geoip.city_name" : [ - "Marrakesh" - ] - }, - "doc_count" : 53, - "support" : 0.011336898395721925 - } - ], - (...) - } - } -------------------------------------------------- -// TEST[skip:setup kibana sample data] - -The response shows the categories that customers purchase from most frequently -together, the location of the customers who tend to buy items from these -categories, and the most frequent price ranges of these purchases. \ No newline at end of file