Apply the fast filter optimization to composite aggregation (#11505)
We can now do efficient date histogram aggregation under composite aggregations.
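
For context, the shape of request this targets is a composite aggregation whose only source is a date histogram, with no sub-aggregations. A minimal sketch using the client-side Java builders (the aggregation, field, and size values here are illustrative assumptions, not taken from the commit):

```java
import java.util.List;

import org.opensearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
import org.opensearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder;
import org.opensearch.search.aggregations.bucket.histogram.DateHistogramInterval;

// A composite aggregation with a single date histogram source and no
// sub-aggregations: the case this commit teaches CompositeAggregator to
// answer from pre-built filters instead of per-document collection.
DateHistogramValuesSourceBuilder dateSource = new DateHistogramValuesSourceBuilder("date")
    .field("@timestamp")                            // assumed field name
    .calendarInterval(DateHistogramInterval.DAY);

CompositeAggregationBuilder composite =
    new CompositeAggregationBuilder("by_day", List.of(dateSource)).size(100);
```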

---------

Signed-off-by: bowenlan-amzn <[email protected]>
bowenlan-amzn authored Jan 17, 2024
1 parent 5dd4b61 commit d5a6f99
Showing 14 changed files with 585 additions and 378 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -188,6 +188,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Performance improvement for MultiTerm Queries on Keyword fields ([#7057](https://github.com/opensearch-project/OpenSearch/issues/7057))
- Refactor common parts from the Rounding class into a separate 'round' package ([#11023](https://github.com/opensearch-project/OpenSearch/issues/11023))
- Performance improvement for date histogram aggregations without sub-aggregations ([#11083](https://github.com/opensearch-project/OpenSearch/pull/11083))
- Apply the fast filter optimization to composite aggregation of date histogram source ([#11505](https://github.com/opensearch-project/OpenSearch/pull/11505))
- Disable concurrent aggs for Diversified Sampler and Sampler aggs ([#11087](https://github.com/opensearch-project/OpenSearch/issues/11087))
- Made leader/follower check timeout setting dynamic ([#10528](https://github.com/opensearch-project/OpenSearch/pull/10528))
- Improved performance of numeric exact-match queries ([#11209](https://github.com/opensearch-project/OpenSearch/pull/11209))

Large diffs are not rendered by default.
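
The large diff omitted above is presumably FastFilterRewriteHelper, which the changes below import and call into. Based only on the call sites visible in this commit, the per-segment fast path might look roughly like the following schematic — every name and signature in the body is an assumption, not the helper's actual code:

```java
import java.io.IOException;
import java.util.function.BiConsumer;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Weight;

// Hypothetical schematic, NOT the real helper: each pre-built filter weight
// corresponds to one rounded bucket key. If Lucene can answer Weight#count
// cheaply for every filter on this segment, credit the counts to the bucket
// consumer and skip doc-by-doc collection; otherwise tell the caller to fall
// back to the normal collector.
static boolean tryFastFilterAggregation(
    LeafReaderContext ctx,
    Weight[] filters,                      // one filter per date histogram bucket
    long[] bucketKeys,                     // rounded key for each filter
    BiConsumer<Long, Long> incrementDocCount
) throws IOException {
    if (filters == null) return false;     // the rewrite was not applicable
    long[] counts = new long[filters.length];
    for (int i = 0; i < filters.length; i++) {
        int c = filters[i].count(ctx);
        if (c == -1) return false;         // not cheaply countable; fall back
        counts[i] = c;
    }
    for (int i = 0; i < filters.length; i++) {
        if (counts[i] > 0) incrementDocCount.accept(bucketKeys[i], counts[i]);
    }
    return true;
}
```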

@@ -56,7 +56,9 @@
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.comparators.LongComparator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.RoaringDocIdSet;
import org.opensearch.common.Rounding;
import org.opensearch.common.lease.Releasables;
import org.opensearch.index.IndexSortConfig;
import org.opensearch.lucene.queries.SearchAfterSortedDocQuery;
@@ -71,7 +73,9 @@
import org.opensearch.search.aggregations.MultiBucketCollector;
import org.opensearch.search.aggregations.MultiBucketConsumerService;
import org.opensearch.search.aggregations.bucket.BucketsAggregator;
import org.opensearch.search.aggregations.bucket.FastFilterRewriteHelper;
import org.opensearch.search.aggregations.bucket.missing.MissingOrder;
import org.opensearch.search.aggregations.bucket.terms.LongKeyedBucketOrds;
import org.opensearch.search.internal.SearchContext;
import org.opensearch.search.searchafter.SearchAfterBuilder;
import org.opensearch.search.sort.SortAndFormats;
@@ -80,6 +84,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.LongUnaryOperator;
@@ -111,6 +116,10 @@ final class CompositeAggregator extends BucketsAggregator {

private boolean earlyTerminated;

private final FastFilterRewriteHelper.FastFilterContext fastFilterContext;
private LongKeyedBucketOrds bucketOrds = null;
private Rounding.Prepared preparedRounding = null;

CompositeAggregator(
String name,
AggregatorFactories factories,
@@ -154,12 +163,33 @@ final class CompositeAggregator extends BucketsAggregator {
}
this.queue = new CompositeValuesCollectorQueue(context.bigArrays(), sources, size, rawAfterKey);
this.rawAfterKey = rawAfterKey;

fastFilterContext = new FastFilterRewriteHelper.FastFilterContext();
if (!FastFilterRewriteHelper.isCompositeAggRewriteable(sourceConfigs)) return;
fastFilterContext.setAggregationType(
new FastFilterRewriteHelper.CompositeAggregationType(sourceConfigs, rawAfterKey, formats, size)
);
if (fastFilterContext.isRewriteable(parent, subAggregators.length)) {
// bucketOrds is the data structure for saving date histogram results
bucketOrds = LongKeyedBucketOrds.build(context.bigArrays(), CardinalityUpperBound.ONE);
// Currently the filter rewrite is only supported for date histograms
FastFilterRewriteHelper.CompositeAggregationType aggregationType =
(FastFilterRewriteHelper.CompositeAggregationType) fastFilterContext.aggregationType;
preparedRounding = aggregationType.getRoundingPreparer();
fastFilterContext.buildFastFilter(
context,
fc -> FastFilterRewriteHelper.getAggregationBounds(context, fc.getFieldType().name()),
x -> aggregationType.getRounding(),
() -> preparedRounding
);
}
}

@Override
protected void doClose() {
try {
Releasables.close(queue);
Releasables.close(bucketOrds);
} finally {
Releasables.close(sources);
}
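
The constructor above gates the rewrite twice: once on the source configuration (isCompositeAggRewriteable) and once on the surrounding aggregation tree (isRewriteable). A hedged condensation of those checks into one predicate — the method itself is hypothetical; the real logic lives in FastFilterRewriteHelper and CompositeAggregationType:

```java
// Hypothetical condensation of the two gates used in the constructor above
// (assumes same-package access to the composite source types):
static boolean canUseFastFilter(CompositeValuesSourceConfig[] sourceConfigs, Object parent, int subAggCount) {
    // only a single date histogram source qualifies: its values source is the
    // rounding wrapper, and scripted values cannot be rewritten into filters
    boolean singleDateHistogramSource = sourceConfigs.length == 1
        && sourceConfigs[0].valuesSource() instanceof RoundingValuesSource
        && sourceConfigs[0].hasScript() == false;
    // the composite must be a top-level aggregation with no sub-aggregations
    return singleDateHistogramSource && parent == null && subAggCount == 0;
}
```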
@@ -187,12 +217,14 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I
}

int num = Math.min(size, queue.size());
final InternalComposite.InternalBucket[] buckets = new InternalComposite.InternalBucket[num];
InternalComposite.InternalBucket[] buckets = new InternalComposite.InternalBucket[num];

long[] bucketOrdsToCollect = new long[queue.size()];
for (int i = 0; i < queue.size(); i++) {
bucketOrdsToCollect[i] = i;
}
InternalAggregations[] subAggsForBuckets = buildSubAggsForBuckets(bucketOrdsToCollect);

while (queue.size() > 0) {
int slot = queue.pop();
CompositeKey key = queue.toCompositeKey(slot);
Expand All @@ -208,6 +240,43 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I
aggs
);
}

// Build results from the fast filter optimization
if (bucketOrds != null) {
// CompositeKey is the bucket key value
final Map<CompositeKey, InternalComposite.InternalBucket> bucketMap = new HashMap<>();
// Some segments may not be optimized, so buckets may contain results from the queue.
for (InternalComposite.InternalBucket internalBucket : buckets) {
bucketMap.put(internalBucket.getRawKey(), internalBucket);
}
// Loop over the buckets in the bucketOrds, and populate the map accordingly
LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(0);
while (ordsEnum.next()) {
Long bucketKeyValue = ordsEnum.value();
CompositeKey key = new CompositeKey(bucketKeyValue);
if (bucketMap.containsKey(key)) {
long docCount = bucketDocCount(ordsEnum.ord()) + bucketMap.get(key).getDocCount();
bucketMap.get(key).setDocCount(docCount);
} else {
InternalComposite.InternalBucket bucket = new InternalComposite.InternalBucket(
sourceNames,
formats,
key,
reverseMuls,
missingOrders,
bucketDocCount(ordsEnum.ord()),
buildEmptySubAggregations()
);
bucketMap.put(key, bucket);
}
}
// since a map is not a sorted structure, sort it before transforming back to buckets
List<InternalComposite.InternalBucket> bucketList = new ArrayList<>(bucketMap.values());
CollectionUtil.introSort(bucketList, InternalComposite.InternalBucket::compareKey);
buckets = bucketList.subList(0, Math.min(size, bucketList.size())).toArray(InternalComposite.InternalBucket[]::new);
num = buckets.length;
}

CompositeKey lastBucket = num > 0 ? buckets[num - 1].getRawKey() : null;
return new InternalAggregation[] {
new InternalComposite(
@@ -296,16 +365,16 @@ private Sort buildIndexSortPrefix(LeafReaderContext context) throws IOException

if (indexSortField.getReverse() != (source.reverseMul == -1)) {
if (i == 0) {
// the leading index sort matches the leading source field but the order is reversed
// the leading index sort matches the leading source field, but the order is reversed,
// so we don't check the other sources.
return new Sort(indexSortField);
}
break;
}
sortFields.add(indexSortField);
if (sourceConfig.valuesSource() instanceof RoundingValuesSource) {
// the rounding "squashes" many values together, that breaks the ordering of sub-values
// so we ignore subsequent source even if they match the index sort.
// the rounding "squashes" many values together, which breaks the ordering of sub-values,
// so we ignore the subsequent sources even if they match the index sort.
break;
}
}
@@ -448,6 +517,16 @@ private void processLeafFromQuery(LeafReaderContext ctx, Sort indexSortPrefix) t

@Override
protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
boolean optimized = FastFilterRewriteHelper.tryFastFilterAggregation(
ctx,
fastFilterContext,
(key, count) -> incrementBucketDocCount(
FastFilterRewriteHelper.getBucketOrd(bucketOrds.add(0, preparedRounding.round(key))),
count
)
);
if (optimized) throw new CollectionTerminatedException();

finishLeaf();

boolean fillDocIdSet = deferredCollectors != NO_OP_COLLECTOR;
@@ -477,9 +556,10 @@ protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucket
docIdSetBuilder = new RoaringDocIdSet.Builder(ctx.reader().maxDoc());
}
if (rawAfterKey != null && sortPrefixLen > 0) {
// We have an after key and index sort is applicable so we jump directly to the doc
// that is after the index sort prefix using the rawAfterKey and we start collecting
// document from there.
// We have an after key and index sort is applicable, so we jump directly to the doc
// after the index sort prefix using the rawAfterKey and we start collecting
// documents from there.
assert indexSortPrefix != null;
processLeafFromQuery(ctx, indexSortPrefix);
throw new CollectionTerminatedException();
} else {
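
The collector callback at the top of this hunk routes every (key, count) pair through getBucketOrd(bucketOrds.add(...)). A toy illustration of the LongKeyedBucketOrds contract that makes this work (timestampMillis and count are assumed locals):

```java
// LongKeyedBucketOrds.add(owningOrd, key) hands out a fresh ordinal for a new
// key and returns -1 - existingOrd when the key was seen before; getBucketOrd
// simply undoes that encoding so every rounded key maps to one ordinal.
long ord = bucketOrds.add(0, preparedRounding.round(timestampMillis));
if (ord < 0) {
    ord = -1 - ord;                 // key already present: recover its ordinal
}
incrementBucketDocCount(ord, count);
```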
@@ -507,6 +587,8 @@ public void collect(int doc, long bucket) throws IOException {
try {
long docCount = docCountProvider.getDocCount(doc);
if (queue.addIfCompetitive(indexSortPrefix, docCount)) {
// one doc may contain multiple values; we iterate over them and collect one by one,
// so the same doc can appear multiple times here
if (builder != null && lastDoc != doc) {
builder.add(doc);
lastDoc = doc;
@@ -569,7 +651,7 @@ private LeafBucketCollector getSecondPassCollector(LeafBucketCollector subCollec
@Override
public void collect(int doc, long zeroBucket) throws IOException {
assert zeroBucket == 0;
Integer slot = queue.compareCurrent();
Integer slot = queue.getCurrentSlot();
if (slot != null) {
// The candidate key is a top bucket.
// We can defer the collection of this document/bucket to the sub collector
@@ -44,7 +44,7 @@
*
* @opensearch.internal
*/
class CompositeKey implements Writeable {
public class CompositeKey implements Writeable {
private final Comparable[] values;

CompositeKey(Comparable... values) {
@@ -64,11 +64,11 @@ Comparable[] values() {
return values;
}

int size() {
public int size() {
return values.length;
}

Comparable get(int pos) {
public Comparable get(int pos) {
assert pos < values.length;
return values[pos];
}
@@ -47,6 +47,8 @@

/**
* A specialized {@link PriorityQueue} implementation for composite buckets.
* It can be thought of as a max heap that holds the slots of the top buckets in order.
* Each slot holds the values of the composite bucket key it represents.
*
* @opensearch.internal
*/
@@ -77,7 +79,7 @@ public int hashCode() {

private final BigArrays bigArrays;
private final int maxSize;
private final Map<Slot, Integer> map;
private final Map<Slot, Integer> map; // to quickly find the slot for a value
private final SingleDimensionValuesSource<?>[] arrays;

private LongArray docCounts;
Expand Down Expand Up @@ -108,7 +110,7 @@ public int hashCode() {

@Override
protected boolean lessThan(Integer a, Integer b) {
return compare(a, b) > 0;
return compare(a, b) > 0; // max heap
}

/**
@@ -119,10 +121,10 @@ boolean isFull() {
}

/**
* Compares the current candidate with the values in the queue and returns
* Tries to get the slot of the current/candidate values in the queue and returns
* the slot if the candidate is already in the queue or null if the candidate is not present.
*/
Integer compareCurrent() {
Integer getCurrentSlot() {
return map.get(new Slot(CANDIDATE_SLOT));
}

@@ -281,32 +283,34 @@ boolean addIfCompetitive(long inc) {
*/
boolean addIfCompetitive(int indexSortSourcePrefix, long inc) {
// checks if the candidate key is competitive
Integer topSlot = compareCurrent();
if (topSlot != null) {
Integer curSlot = getCurrentSlot();
if (curSlot != null) {
// this key is already in the top N, skip it
docCounts.increment(topSlot, inc);
docCounts.increment(curSlot, inc);
return true;
}

if (afterKeyIsSet) {
int cmp = compareCurrentWithAfter();
if (cmp <= 0) {
if (indexSortSourcePrefix < 0 && cmp == indexSortSourcePrefix) {
// the leading index sort is in the reverse order of the leading source
// the leading index sort and the leading source order are both reversed,
// so we can early terminate when we reach a document that is smaller
// than the after key (collected on a previous page).
throw new CollectionTerminatedException();
}
// key was collected on a previous page, skip it (>= afterKey).
// the key was collected on a previous page, skip it.
return false;
}
}

// the heap is full, check if the candidate key is larger than the max heap top
if (size() >= maxSize) {
// the tree map is full, check if the candidate key should be kept
int cmp = compare(CANDIDATE_SLOT, top());
if (cmp > 0) {
if (cmp <= indexSortSourcePrefix) {
// index sort guarantees that there is no key greater or equal than the
// current one in the subsequent documents so we can early terminate.
// index sort guarantees the following documents will have a key larger than the current candidate,
// so we can early terminate.
throw new CollectionTerminatedException();
}
// the candidate key is not competitive, skip it.
@@ -324,7 +328,7 @@ boolean addIfCompetitive(int indexSortSourcePrefix, long inc) {
} else {
newSlot = size();
}
// move the candidate key to its new slot
// move the candidate key to its new slot by copying its values there
copyCurrent(newSlot, inc);
map.put(new Slot(newSlot), newSlot);
add(newSlot);
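
addIfCompetitive leans on the inverted lessThan shown earlier (compare(a, b) > 0). A self-contained toy showing why a max heap retains the N smallest composite keys, with plain longs standing in for composite keys:

```java
import org.apache.lucene.util.PriorityQueue;

// With lessThan inverted, top() is the LARGEST retained key, so
// insertWithOverflow evicts it whenever a smaller (more competitive, in the
// default ascending order) key arrives — leaving the N smallest in the queue.
PriorityQueue<Long> smallestN = new PriorityQueue<>(3) {
    @Override
    protected boolean lessThan(Long a, Long b) {
        return a > b; // inverted ordering makes this a max heap
    }
};
for (long key : new long[] { 42, 7, 99, 3, 15 }) {
    smallestN.insertWithOverflow(key);
}
// smallestN now holds {3, 7, 15}; top() == 15 is the next eviction candidate.
```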
@@ -156,7 +156,7 @@ public MissingOrder missingOrder() {
/**
* Returns true if the source contains a script that can change the value.
*/
protected boolean hasScript() {
public boolean hasScript() {
return hasScript;
}

@@ -298,7 +298,7 @@ public static void register(ValuesSourceRegistry.Builder builder) {
// TODO once composite is plugged in to the values source registry or at least understands Date values source types use it
// here
Rounding.Prepared preparedRounding = rounding.prepareForUnknown();
RoundingValuesSource vs = new RoundingValuesSource(numeric, preparedRounding);
RoundingValuesSource vs = new RoundingValuesSource(numeric, preparedRounding, rounding);
// is specified in the builder.
final DocValueFormat docValueFormat = format == null ? DocValueFormat.RAW : valuesSourceConfig.format();
final MappedFieldType fieldType = valuesSourceConfig.fieldType();
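
Passing the raw rounding through to RoundingValuesSource matters because prepareForUnknown() cannot pick the cheapest rounding strategy. Keeping the unprepared Rounding lets the fast filter code re-prepare it once real bounds are known — a hedged sketch of the idea, with the bound values assumed:

```java
import org.opensearch.common.Rounding;

// With the raw Rounding retained, the rewrite can prepare it against the
// effective [min, max] of the shard, which unlocks cheaper fixed-interval
// rounding than the unbounded prepareForUnknown() above.
long minUtcMillis = 1704067200000L;   // assumed lower bound from field stats
long maxUtcMillis = 1706745600000L;   // assumed upper bound from field stats
Rounding.Prepared bounded = rounding.prepare(minUtcMillis, maxUtcMillis);
```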
@@ -339,7 +339,7 @@ public static class InternalBucket extends InternalMultiBucketAggregation.Intern
KeyComparable<InternalBucket> {

private final CompositeKey key;
private final long docCount;
private long docCount;
private final InternalAggregations aggregations;
private final transient int[] reverseMuls;
private final transient MissingOrder[] missingOrders;
@@ -436,6 +436,10 @@ public long getDocCount() {
return docCount;
}

public void setDocCount(long docCount) {
this.docCount = docCount;
}

@Override
public Aggregations getAggregations() {
return aggregations;