Skip to content

Commit

Permalink
Optimize global ordinal includes/excludes for prefix matching (opense…
Browse files Browse the repository at this point in the history
…arch-project#14371)

* Optimize global ordinal includes/excludes for prefix matching

If an aggregration specifies includes or excludes based on a regular
expression, and the regular expression has a finite expansion followed
by .*, then we can optimize the global ordinal filter.

Specifically, in this case, we can expand the matching prefixes, then
include/exclude the range of global ordinals that start with each
prefix.

Signed-off-by: Michael Froh <[email protected]>

* Add unit test

Signed-off-by: Michael Froh <[email protected]>

* Add changelog entry

Signed-off-by: Michael Froh <[email protected]>

* Improve test coverage

Updated the unit test to be functionally equivalent, but it covers
more of the regex logic.

Signed-off-by: Michael Froh <[email protected]>

* Improve test coverage

Signed-off-by: Michael Froh <[email protected]>

* Fix bug in exclude-only case with no doc values in segment

Signed-off-by: Michael Froh <[email protected]>

* Address comments from @mch2

Signed-off-by: Michael Froh <[email protected]>

---------

Signed-off-by: Michael Froh <[email protected]>
  • Loading branch information
msfroh authored and wangdongyu.danny committed Aug 22, 2024
1 parent 48e848f commit aba4caa
Show file tree
Hide file tree
Showing 3 changed files with 439 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

### Changed
- Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371))
- Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238))

### Deprecated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@
import org.opensearch.search.DocValueFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
Expand Down Expand Up @@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment {
* https://github.com/opensearch-project/OpenSearch/issues/2858
*/
private static final int DEFAULT_MAX_REGEX_LENGTH = 1000;
/**
* The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter
*/
private static final int MAX_PREFIXES = 1000;

// for parsing purposes only
// TODO: move all aggs to the same package so that this stuff could be pkg-private
Expand Down Expand Up @@ -393,6 +401,92 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro

}

/**
* An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes
*/
static class PrefixBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includePrefixes, excludePrefixes;

private PrefixBackedOrdinalsFilter(SortedSet<BytesRef> includePrefixes, SortedSet<BytesRef> excludePrefixes) {
this.includePrefixes = includePrefixes;
this.excludePrefixes = excludePrefixes;
}

private static BytesRef nextBytesRef(BytesRef bytesRef) {
BytesRef next = BytesRef.deepCopyOf(bytesRef);
int pos = next.offset + next.length - 1;
while (pos >= next.offset && next.bytes[pos] == -1) {
next.bytes[pos] = 0;
pos--;
}
if (pos >= next.offset) {
next.bytes[pos]++;
} else {
// Every byte in our prefix had value 0xFF. We must match all subsequent ordinals.
return null;
}
return next;
}

private interface LongBiConsumer {
void accept(long a, long b);
}

private static void process(SortedSetDocValues globalOrdinals, long length, SortedSet<BytesRef> prefixes, LongBiConsumer consumer)
throws IOException {
for (BytesRef prefix : prefixes) {
long startOrd = globalOrdinals.lookupTerm(prefix);
if (startOrd < 0) {
// The prefix is not an exact match in the ordinals (can skip equal length below)
startOrd = -1 - startOrd;
// Make sure that the term at startOrd starts with prefix
BytesRef startTerm = globalOrdinals.lookupOrd(startOrd);
if (startTerm.length <= prefix.length
|| !Arrays.equals(
startTerm.bytes,
startTerm.offset,
startTerm.offset + prefix.length,
prefix.bytes,
prefix.offset,
prefix.offset + prefix.length
)) {
continue;
}
}
if (startOrd >= length) {
continue;
}
BytesRef next = nextBytesRef(prefix);
if (next == null) {
consumer.accept(startOrd, length);
} else {
long endOrd = globalOrdinals.lookupTerm(next);
if (endOrd < 0) {
endOrd = -1 - endOrd;
}
if (startOrd < endOrd) {
consumer.accept(startOrd, endOrd);
}
}
}

}

@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount());
if (!includePrefixes.isEmpty()) {
process(globalOrdinals, accept.length(), includePrefixes, accept::set);
} else if (accept.length() > 0) {
// Exclude-only
accept.set(0, accept.length());
}
process(globalOrdinals, accept.length(), excludePrefixes, accept::clear);
return accept;
}
}

private final String include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand Down Expand Up @@ -709,8 +803,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) {

if (isRegexBased()) {
if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) {
PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength);
if (prefixBackedOrdinalsFilter != null) {
return prefixBackedOrdinalsFilter;
}
}
return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength));
}
if (isPartitionBased()) {
Expand All @@ -720,6 +819,94 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege
return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
}

private static List<String> expandRegexp(RegExp regExp, int maxPrefixes) {
switch (regExp.kind) {
case REGEXP_UNION:
List<RegExp> alternatives = new ArrayList<>();
while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) {
alternatives.add(regExp.exp2);
regExp = regExp.exp1;
}
alternatives.add(regExp.exp2);
alternatives.add(regExp.exp1);
List<String> output = new ArrayList<>();
for (RegExp leaf : alternatives) {
List<String> leafExpansions = expandRegexp(leaf, maxPrefixes);
if (leafExpansions == null) {
return null;
} else {
if (output.size() + leafExpansions.size() > maxPrefixes) {
return null;
}
output.addAll(leafExpansions);
}
}
return output;
case REGEXP_CONCATENATION:
List<String> prefixes = expandRegexp(regExp.exp1, maxPrefixes);
if (prefixes == null) {
return null;
}
List<String> suffixes = expandRegexp(regExp.exp2, maxPrefixes);
if (suffixes == null) {
return null;
}
if (prefixes.size() * suffixes.size() > maxPrefixes) {
return null;
}
List<String> out = new ArrayList<>();
StringBuilder stringBuilder = new StringBuilder();
for (String prefix : prefixes) {
for (String suffix : suffixes) {
stringBuilder.setLength(0);
stringBuilder.append(prefix).append(suffix);
out.add(stringBuilder.toString());
}
}
return out;
case REGEXP_CHAR:
return List.of(Character.toString(regExp.c));
case REGEXP_STRING:
return List.of(regExp.s);
default:
return null;
}
}

static SortedSet<BytesRef> extractPrefixes(String pattern, int maxRegexLength) {
if (pattern == null) {
return Collections.emptySortedSet();
}
SortedSet<BytesRef> prefixSet = null;
validateRegExpStringLength(pattern, maxRegexLength);
RegExp regExp = new RegExp(pattern);
if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION && regExp.exp2.kind == RegExp.Kind.REGEXP_REPEAT) {
RegExp tail = regExp.exp2.exp1;
if (tail.kind == RegExp.Kind.REGEXP_ANYCHAR || tail.kind == RegExp.Kind.REGEXP_ANYSTRING) {
List<String> prefixes = expandRegexp(regExp.exp1, MAX_PREFIXES);
if (prefixes != null) {
prefixSet = new TreeSet<>();
for (String prefix : prefixes) {
prefixSet.add(new BytesRef(prefix));
}
}
}
}
return prefixSet;
}

private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) {
SortedSet<BytesRef> includeSet = extractPrefixes(include, maxRegexLength);
if (includeSet == null) {
return null;
}
SortedSet<BytesRef> excludeSet = extractPrefixes(exclude, maxRegexLength);
if (excludeSet == null) {
return null;
}
return new PrefixBackedOrdinalsFilter(includeSet, excludeSet);
}

public LongFilter convertToLongFilter(DocValueFormat format) {

if (isPartitionBased()) {
Expand Down
Loading

0 comments on commit aba4caa

Please sign in to comment.