Fix doc_count on HistoBackedHistogramAggregator #74650

Merged 1 commit on Jun 28, 2021
@@ -86,9 +86,10 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
         } else {
             collectBucket(sub, doc, bucketOrd);
         }
-        // We have added the document already. We should increment doc_count by count - 1
-        // so that we have added it count times.
-        incrementBucketDocCount(bucketOrd, count - 1);
+        // We have already added the document, and the bucket doc_count was incremented
+        // by _doc_count. To compensate, increment doc_count by (count - _doc_count) so
+        // that, in total, the document is counted count times.
+        incrementBucketDocCount(bucketOrd, count - docCountProvider.getDocCount(doc));
     }
     previousKey = key;
 }

Review comment (Member):
No matter what _doc_count is, it was previously added to the bucket_count via the collectBucket methods. It could be ANY number.

Consequently, this incrementBucketDocCount may actually be decrementing the bucket_count to adjust for the difference.

I think this is fine.

The other potential solution is to override collectBucket... so the bucket count is not incremented. But that may prove too complicated.

I think this is a good solution for now 👍

Somebody else from the aggs team should take a look as well.

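To make the compensation arithmetic concrete, here is a minimal, self-contained sketch in plain Java. It is not Elasticsearch code: the class name is hypothetical, and count / docCount are stand-ins for the sum of a document's histogram counts and its _doc_count field, using the values from the new unit test below.

// A minimal sketch (not Elasticsearch code) of the doc_count arithmetic in this fix.
public class DocCountCompensation {
    public static void main(String[] args) {
        long count = 8;     // sum of the histogram value counts for one document
        long docCount = 8;  // the document's _doc_count field

        // collectBucket has already incremented the bucket by _doc_count.
        long bucket = docCount;

        // Old compensation assumed the bucket had only been incremented by 1:
        // 8 + (8 - 1) = 15, which over-counts.
        long buggy = bucket + (count - 1);

        // Fixed compensation subtracts what was actually added:
        // 8 + (8 - 8) = 8, the correct total.
        long fixed = bucket + (count - docCount);

        // When _doc_count exceeds count, the adjustment is negative, so the
        // "increment" effectively decrements the bucket -- the reviewer's point.
        System.out.println("buggy = " + buggy + ", fixed = " + fixed);
    }
}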
@@ -7,17 +7,12 @@

 package org.elasticsearch.xpack.analytics.aggregations.bucket.histogram;

-import static java.util.Collections.singleton;
-import static org.elasticsearch.xpack.analytics.AnalyticsTestsUtils.histogramFieldDocValues;
-
-import java.util.Collections;
-import java.util.List;
-
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.store.Directory;
+import org.elasticsearch.index.mapper.CustomTermFreqField;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.plugins.SearchPlugin;
 import org.elasticsearch.search.aggregations.AggregationBuilder;
@@ -30,6 +25,12 @@
 import org.elasticsearch.xpack.analytics.AnalyticsPlugin;
 import org.elasticsearch.xpack.analytics.mapper.HistogramFieldMapper;

+import java.util.Collections;
+import java.util.List;
+
+import static java.util.Collections.singleton;
+import static org.elasticsearch.xpack.analytics.AnalyticsTestsUtils.histogramFieldDocValues;
+
 public class HistoBackedHistogramAggregatorTests extends AggregatorTestCase {

     private static final String FIELD_NAME = "field";
@@ -99,6 +100,27 @@ public void testMinDocCount() throws Exception {
         }
     }

+    public void testHistogramWithDocCountField() throws Exception {
+        try (Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir)) {
+            w.addDocument(List.of(
+                // Add the _doc_count field
+                new CustomTermFreqField("_doc_count", "_doc_count", 8),
+                histogramFieldDocValues(FIELD_NAME, new double[] {0, 1.2, 10, 10, 12, 24, 24, 24}))
+            );
+
+            HistogramAggregationBuilder aggBuilder = new HistogramAggregationBuilder("my_agg")
+                .field(FIELD_NAME)
+                .interval(100);
+
+            try (IndexReader reader = w.getReader()) {
+                IndexSearcher searcher = new IndexSearcher(reader);
+                InternalHistogram histogram = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, defaultFieldType(FIELD_NAME));
+                assertTrue(AggregationInspectionHelper.hasValue(histogram));
+                assertEquals(8, histogram.getBuckets().get(0).getDocCount());
+            }
+        }
+    }
+
     public void testRandomOffset() throws Exception {
         try (Directory dir = newDirectory();
             RandomIndexWriter w = new RandomIndexWriter(random(), dir)) {
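A note on the expected value in testHistogramWithDocCountField: assuming histogramFieldDocValues assigns a count of 1 to each of the 8 values, the document's total histogram count equals its _doc_count of 8, so the compensation (count - _doc_count) is 0 and the bucket's doc_count stays at 8. Under the old (count - 1) compensation, the same document would have produced 8 + 7 = 15.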
@@ -74,10 +74,46 @@ setup:
                 field: "latency"
                 interval: 0.3

   - match: { hits.total.value: 2 }
   - length: { aggregations.histo.buckets: 2 }
   - match: { aggregations.histo.buckets.0.key: 0.0 }
   - match: { aggregations.histo.buckets.0.doc_count: 20 }
   - match: { aggregations.histo.buckets.1.key: 0.3 }
   - match: { aggregations.histo.buckets.1.doc_count: 60 }

+---
+"Histogram with _doc_count":
+  - do:
+      indices.create:
+        index: "histo_with_doc_count"
+        body:
+          mappings:
+            properties:
+              latency:
+                type: "histogram"
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      bulk:
+        index: "histo_with_doc_count"
+        refresh: true
+        body:
+          - '{"index": {}}'
+          - '{"_doc_count": 50, "latency": {"values" : [0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 7, 23, 12, 5]}}'
+          - '{"index": {}}'
+          - '{"_doc_count": 10, "latency": {"values" : [0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [1, 1, 1, 1, 6]}}'
+  - do:
+      search:
+        index: "histo_with_doc_count"
+        body:
+          size: 0
+          aggs:
+            histo:
+              histogram:
+                field: "latency"
+                interval: 1
+
+  - match: { hits.total.value: 2 }
+  - length: { aggregations.histo.buckets: 1 }
+  - match: { aggregations.histo.buckets.0.key: 0.0 }
+  - match: { aggregations.histo.buckets.0.doc_count: 60 }
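The expected assertions follow directly from the bulk data: each document's _doc_count matches the sum of its counts array (3+7+23+12+5 = 50 and 1+1+1+1+6 = 10), and with interval 1 every latency value falls into the single [0.0, 1.0) bucket, so its doc_count is 50 + 10 = 60.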