From 2d7191891abcad8c8c54317c81d4a72586aa5234 Mon Sep 17 00:00:00 2001 From: "oerling@fb.com" Date: Sun, 25 Aug 2019 23:09:01 -0700 Subject: [PATCH] Adds testLength to TupleDomainFilter This allows prefiltering strings and later lists and maps based on length. We note that testLength of a PositionalFilter is true and has no effect on the position for simplicity. This could also advance the positional filter if false and leave it at position if true. --- .../presto/orc/TupleDomainFilter.java | 61 +++++++++++++++++++ .../presto/orc/TestPositionalFilter.java | 30 ++++++--- .../presto/orc/TestTupleDomainFilter.java | 7 +++ 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/presto-orc/src/main/java/com/facebook/presto/orc/TupleDomainFilter.java b/presto-orc/src/main/java/com/facebook/presto/orc/TupleDomainFilter.java index 55b85ea1c134..101cff41680c 100644 --- a/presto-orc/src/main/java/com/facebook/presto/orc/TupleDomainFilter.java +++ b/presto-orc/src/main/java/com/facebook/presto/orc/TupleDomainFilter.java @@ -67,6 +67,13 @@ public interface TupleDomainFilter boolean testBytes(byte[] buffer, int offset, int length); + /** + * Filters like string equality and IN, as well as conditions on cardinality of lists and maps can be at least partly + * decided by looking at lengths alone. If this is false, then no further checks are needed. If true, eventual filters on the + * data itself need to be evaluated. + */ + boolean testLength(int length); + /** * When a filter applied to a nested column fails, the whole top-level position should * fail. 
To enable this functionality, the filter keeps track of the boundaries of @@ -148,6 +155,12 @@ public boolean testBytes(byte[] buffer, int offset, int length) throw new UnsupportedOperationException(); } + @Override + public boolean testLength(int length) + { + throw new UnsupportedOperationException(); + } + @Override public int getPrecedingPositionsToFail() { @@ -211,6 +224,12 @@ public boolean testBytes(byte[] buffer, int offset, int length) return false; } + @Override + public boolean testLength(int length) + { + return false; + } + @Override public String toString() { @@ -268,6 +287,12 @@ public boolean testBytes(byte[] buffer, int offset, int length) return false; } + @Override + public boolean testLength(int length) + { + return false; + } + @Override public String toString() { @@ -325,6 +350,12 @@ public boolean testBytes(byte[] buffer, int offset, int length) return true; } + @Override + public boolean testLength(int length) + { + return true; + } + @Override public String toString() { @@ -887,6 +918,12 @@ public boolean testBytes(byte[] buffer, int offset, int length) return true; } + @Override + public boolean testLength(int length) + { + return !singleValue || lower.length == length; + } + @Override public int hashCode() { @@ -943,6 +980,8 @@ class BytesValues private final int hashTableSizeMask; private final long[] bloom; private final int bloomSize; + // Contains true in position i if at least one of the values has length i. + private final boolean[] lengthExists; private BytesValues(byte[][] values, boolean nullAllowed) { @@ -952,6 +991,7 @@ private BytesValues(byte[][] values, boolean nullAllowed) checkArgument(values.length > 1, "values must contain at least 2 entries"); this.values = values; + lengthExists = new boolean[Arrays.stream(values).mapToInt(value -> value.length).max().getAsInt() + 1]; // Linear hash table size is the highest power of two less than or equal to number of values * 4. This means that the // table is under half full, e.g. 
127 elements gets 256 slots. int hashTableSize = Integer.highestOneBit(values.length * 4); @@ -961,6 +1001,7 @@ private BytesValues(byte[][] values, boolean nullAllowed) bloomSize = Math.max(1, hashTableSize / 8); bloom = new long[bloomSize]; for (byte[] value : values) { + lengthExists[value.length] = true; long hashCode = hash(value, 0, value.length); bloom[bloomIndex(hashCode)] |= bloomMask(hashCode); int position = (int) (hashCode & hashTableSizeMask); @@ -1003,6 +1044,12 @@ public boolean testBytes(byte[] value, int offset, int length) return false; } + @Override + public boolean testLength(int length) + { + return length < lengthExists.length && lengthExists[length]; + } + private static long bloomMask(long hashCode) { return (1L << ((hashCode >> 20) & 63)) | (1L << ((hashCode >> 26) & 63)) | (1L << ((hashCode >> 32) & 63)); @@ -1409,6 +1456,15 @@ public boolean testBytes(byte[] buffer, int offset, int length) return recordTestResult(filter.testBytes(buffer, offset, length)); } + + public boolean testLength(int length) + { + // Returns true without advancing to the next filter because this is a pre-check followed by a test on the value, + // which will advance the state. TODO: We could advance the state on false and not advance on true. Consider the + // case where testLength is the only filter on a list/map inside another. This would imply exposing advancing as a + // separate operation. 
+ return true; + } } class NullsFilter @@ -1468,5 +1524,10 @@ public boolean testBytes(byte[] buffer, int offset, int length) { throw new UnsupportedOperationException(); } + + public boolean testLength(int length) + { + throw new UnsupportedOperationException(); + } } } diff --git a/presto-orc/src/test/java/com/facebook/presto/orc/TestPositionalFilter.java b/presto-orc/src/test/java/com/facebook/presto/orc/TestPositionalFilter.java index ff4a2bdad259..76671ce348e4 100644 --- a/presto-orc/src/test/java/com/facebook/presto/orc/TestPositionalFilter.java +++ b/presto-orc/src/test/java/com/facebook/presto/orc/TestPositionalFilter.java @@ -13,10 +13,13 @@ */ package com.facebook.presto.orc; -import com.facebook.presto.orc.TupleDomainFilter.BigintRange; +import com.facebook.presto.orc.TupleDomainFilter.BytesRange; import com.facebook.presto.orc.TupleDomainFilter.PositionalFilter; +import io.airlift.slice.Slices; import org.testng.annotations.Test; +import java.util.Arrays; + import static com.facebook.presto.testing.assertions.Assert.assertEquals; public class TestPositionalFilter @@ -26,8 +29,9 @@ public void test() { PositionalFilter filter = new PositionalFilter(); - // a[1] = 1 and a[3] = 3 - + // a[1] = '1' and a[3] = '3'. The test data is converted to byte[]'s and the comparison is done using testLength() + // followed by testBytes() so as to cover the double use of the position when testLength succeeds and testBytes + // fails. TupleDomainFilter[] filters = new TupleDomainFilter[] { equals(1), null, equals(3), null, equals(1), null, equals(3), null, @@ -36,13 +40,15 @@ public void test() equals(1), null, equals(3), null, null, null, null }; - long[] values = new long[] { + long[] numbers = new long[] { 1, 2, 3, 4, // pass 0, 2, 3, 4, // fail - 1, 2, 0, 4, 5, // fail + 1, 2, 0, 4, 55, // fail testLength() 1, 0, 3, 0, 5, 6, // pass - 1, 1, 2, 2, 3, 3, 4 // fail + 1, 1, 2, 2, 3, 3, 4 // fail testBytes() }; + // Convert the values to byte[][]. 
+ byte[][] values = Arrays.stream(numbers).mapToObj(n -> toBytes(Long.valueOf(n).toString())).toArray(byte[][]::new); boolean[] expectedResults = new boolean[] { true, true, true, true, @@ -58,7 +64,9 @@ public void test() int valuesIndex = 0; for (int i = 0; i < expectedResults.length; i++) { - assertEquals(expectedResults[i], filter.testLong(values[valuesIndex++])); + boolean result = filter.testLength(values[valuesIndex].length) && filter.testBytes(values[valuesIndex], 0, values[valuesIndex].length); + assertEquals(expectedResults[i], result); + valuesIndex++; if (expectedResults[i] == false) { valuesIndex += filter.getSucceedingPositionsToFail(); } @@ -68,6 +76,12 @@ public void test() private TupleDomainFilter equals(int value) { - return BigintRange.of(value, value, false); + byte[] bytesValue = toBytes(Integer.valueOf(value).toString()); + return BytesRange.of(bytesValue, false, bytesValue, false, false); + } + + private static byte[] toBytes(String value) + { + return Slices.utf8Slice(value).getBytes(); } } diff --git a/presto-orc/src/test/java/com/facebook/presto/orc/TestTupleDomainFilter.java b/presto-orc/src/test/java/com/facebook/presto/orc/TestTupleDomainFilter.java index e7feeb3abbab..a2a13fa83b92 100644 --- a/presto-orc/src/test/java/com/facebook/presto/orc/TestTupleDomainFilter.java +++ b/presto-orc/src/test/java/com/facebook/presto/orc/TestTupleDomainFilter.java @@ -200,9 +200,11 @@ public void testBytesRange() { TupleDomainFilter filter = BytesRange.of(toBytes("abc"), false, toBytes("abc"), false, false); assertTrue(filter.testBytes(toBytes("abc"), 0, 3)); + assertTrue(filter.testLength(3)); assertFalse(filter.testNull()); assertFalse(filter.testBytes(toBytes("apple"), 0, 5)); + assertFalse(filter.testLength(4)); String theBestOfTimes = "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity,..."; filter = BytesRange.of(null, true, 
toBytes(theBestOfTimes), false, false); @@ -210,6 +212,9 @@ public void testBytesRange() assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 5)); assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 50)); assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 100)); + // testLength is true of all lengths for a range filter. + assertTrue(filter.testLength(1)); + assertTrue(filter.testLength(1000)); assertFalse(filter.testNull()); assertFalse(filter.testBytes(toBytes("Zzz"), 0, 3)); @@ -279,7 +284,9 @@ public void testBytesValues() } } filter = BytesValues.of(filterValues, false); + assertFalse(filter.testLength(10000)); for (int i = 0; i < testValues.length; i++) { + assertEquals(filter.testLength(i), i % 9 == 0); assertEquals(i % 9 == 0, filter.testBytes(testValues[i], 0, testValues[i].length)); } }