Skip to content

Commit

Permalink
Build global ordinals terms bucket from matching ordinals (#30166)
Browse files Browse the repository at this point in the history
The global ordinals terms aggregator has an option to remap global ordinals to
dense ordinal that match the request. This mode is automatically picked when the terms
aggregator is a child of another bucket aggregator or when it needs to defer buckets to an
aggregation that is used in the ordering of the terms.
Though when building the final buckets, this aggregator loops over all possible global ordinals
rather than using the hash map that was built to remap the ordinals.
For fields with high cardinality this is highly inefficient and can lead to slow responses even
when the number of terms that match the query is low.
This change fixes this performance issue by using the hash table of matching ordinals to perform
the pruning of the final buckets for the terms and significant_terms aggregation.
I ran a simple benchmark with 1M documents containing 0 to 10 keywords randomly selected among 1M unique terms.
This field is used to perform a multi-level terms aggregation using rally to collect the response times.
The aggregation below is an example of a two-level terms aggregation that was used to perform the benchmark:

```
"aggregations":{
   "1":{
      "terms":{
         "field":"keyword"
      },
      "aggregations":{
         "2":{
            "terms":{
               "field":"keyword"
            }
         }
      }
   }
}
```

| Levels of aggregation | 50th percentile ms (master) | 50th percentile ms (patch) |
| --- | --- | --- |
| 2 | 640.41ms | 577.499ms |
| 3 | 2239.66ms | 600.154ms |
| 4 | 14141.2ms | 703.512ms |

Closes #30117
  • Loading branch information
jimczi committed May 16, 2018
1 parent fadd4bb commit 84c6255
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.LongHash;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
Expand Down Expand Up @@ -103,11 +101,22 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws

BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
SignificantStringTerms.Bucket spare = null;
for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
final long maxId = needsFullScan ? valueCount : bucketOrds.size();
for (long ord = 0; ord < maxId; ord++) {
final long globalOrd;
final long bucketOrd;
if (needsFullScan) {
bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
globalOrd = ord;
} else {
assert bucketOrds != null;
bucketOrd = ord;
globalOrd = bucketOrds.get(ord);
}
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
continue;
}
final long bucketOrd = getBucketOrd(globalTermOrd);
final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
continue;
Expand All @@ -120,7 +129,7 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws
spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format);
}
spare.bucketOrd = bucketOrd;
copy(lookupGlobalOrd.apply(globalTermOrd), spare.termBytes);
copy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
spare.subsetDf = bucketDocCount;
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
protected final long valueCount;
protected final GlobalOrdLookupFunction lookupGlobalOrd;

private final LongHash bucketOrds;
protected final LongHash bucketOrds;

public interface GlobalOrdLookupFunction {
BytesRef apply(long ord) throws IOException;
Expand Down Expand Up @@ -107,10 +107,6 @@ boolean remapGlobalOrds() {
return bucketOrds != null;
}

protected final long getBucketOrd(long globalOrd) {
return bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
}

private void collectGlobalOrd(int doc, long globalOrd, LeafBucketCollector sub) throws IOException {
if (bucketOrds == null) {
collectExistingBucket(sub, doc, globalOrd);
Expand Down Expand Up @@ -188,17 +184,28 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE
long otherDocCount = 0;
BucketPriorityQueue<OrdBucket> ordered = new BucketPriorityQueue<>(size, order.comparator(this));
OrdBucket spare = new OrdBucket(-1, 0, null, showTermDocCountError, 0);
for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
final long maxId = needsFullScan ? valueCount : bucketOrds.size();
for (long ord = 0; ord < maxId; ord++) {
final long globalOrd;
final long bucketOrd;
if (needsFullScan) {
bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
globalOrd = ord;
} else {
assert bucketOrds != null;
bucketOrd = ord;
globalOrd = bucketOrds.get(ord);
}
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
continue;
}
final long bucketOrd = getBucketOrd(globalTermOrd);
final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
continue;
}
otherDocCount += bucketDocCount;
spare.globalOrd = globalTermOrd;
spare.globalOrd = globalOrd;
spare.bucketOrd = bucketOrd;
spare.docCount = bucketDocCount;
if (bucketCountThresholds.getShardMinDocCount() <= spare.docCount) {
Expand Down Expand Up @@ -378,7 +385,7 @@ private void mapSegmentCountsToGlobalCounts(LongUnaryOperator mapping) throws IO
}
final long ord = i - 1; // remember we do +1 when counting
final long globalOrd = mapping.applyAsLong(ord);
long bucketOrd = getBucketOrd(globalOrd);
long bucketOrd = bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
incrementBucketDocCount(bucketOrd, inc);
}
}
Expand Down

0 comments on commit 84c6255

Please sign in to comment.