Skip to content

Commit

Permalink
Adds testLength to TupleDomainFilter
Browse files Browse the repository at this point in the history
This allows prefiltering strings, and later lists and maps, based on length.
For simplicity, testLength of a PositionalFilter always returns true and has no effect on the position. Alternatively, it could advance the positional filter when the length check fails and leave the position unchanged when it passes.
  • Loading branch information
[email protected] authored and mbasmanova committed Aug 27, 2019
1 parent 6d296f9 commit 2d71918
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ public interface TupleDomainFilter

boolean testBytes(byte[] buffer, int offset, int length);

/**
* Pre-checks a value by its length alone, before the value itself is tested.
*
* Filters like string equality and IN, as well as conditions on cardinality of lists and maps can be at least partly
* decided by looking at lengths alone. If this is false, then no further checks are needed. If true, eventual filters on the
* data itself need to be evaluated.
*
* @param length the length of the candidate value
* @return {@code false} if no value of this length can pass the filter; {@code true} if the value itself still has to
* be tested
*/
boolean testLength(int length);

/**
* When a filter applied to a nested column fails, the whole top-level position should
* fail. To enable this functionality, the filter keeps track of the boundaries of
Expand Down Expand Up @@ -148,6 +155,12 @@ public boolean testBytes(byte[] buffer, int offset, int length)
throw new UnsupportedOperationException();
}

/**
 * Length pre-checks are not supported by this filter.
 *
 * @param length ignored
 * @throws UnsupportedOperationException always
 */
@Override
public boolean testLength(int length)
{
    throw new UnsupportedOperationException();
}

@Override
public int getPrecedingPositionsToFail()
{
Expand Down Expand Up @@ -211,6 +224,12 @@ public boolean testBytes(byte[] buffer, int offset, int length)
return false;
}

/**
 * Rejects every length: the result is {@code false} regardless of the argument.
 *
 * @param length ignored
 * @return always {@code false}
 */
@Override
public boolean testLength(int length)
{
    return false;
}

@Override
public String toString()
{
Expand Down Expand Up @@ -268,6 +287,12 @@ public boolean testBytes(byte[] buffer, int offset, int length)
return false;
}

/**
 * Rejects every length: the result is {@code false} regardless of the argument.
 *
 * @param length ignored
 * @return always {@code false}
 */
@Override
public boolean testLength(int length)
{
    return false;
}

@Override
public String toString()
{
Expand Down Expand Up @@ -325,6 +350,12 @@ public boolean testBytes(byte[] buffer, int offset, int length)
return true;
}

/**
 * Accepts every length: the result is {@code true} regardless of the argument.
 *
 * @param length ignored
 * @return always {@code true}
 */
@Override
public boolean testLength(int length)
{
    return true;
}

@Override
public String toString()
{
Expand Down Expand Up @@ -887,6 +918,12 @@ public boolean testBytes(byte[] buffer, int offset, int length)
return true;
}

/**
 * Pre-checks a candidate value by its length alone.
 *
 * @param length the length of the candidate value
 * @return {@code true} when {@code singleValue} is not set, or when {@code length} equals {@code lower.length}
 */
@Override
public boolean testLength(int length)
{
    if (singleValue) {
        // With singleValue set, the length has to equal the length of the lower bound.
        return lower.length == length;
    }
    // Otherwise the length alone decides nothing; lower is not consulted on this path.
    return true;
}

@Override
public int hashCode()
{
Expand Down Expand Up @@ -943,6 +980,8 @@ class BytesValues
private final int hashTableSizeMask;
private final long[] bloom;
private final int bloomSize;
// Contains true in position i if at least one of the values has length i.
private final boolean[] lengthExists;

private BytesValues(byte[][] values, boolean nullAllowed)
{
Expand All @@ -952,6 +991,7 @@ private BytesValues(byte[][] values, boolean nullAllowed)
checkArgument(values.length > 1, "values must contain at least 2 entries");

this.values = values;
lengthExists = new boolean[Arrays.stream(values).mapToInt(value -> value.length).max().getAsInt() + 1];
// Linear hash table size is the highest power of two less than or equal to number of values * 4. This means that the
// table is under half full, e.g. 127 elements get 256 slots.
int hashTableSize = Integer.highestOneBit(values.length * 4);
Expand All @@ -961,6 +1001,7 @@ private BytesValues(byte[][] values, boolean nullAllowed)
bloomSize = Math.max(1, hashTableSize / 8);
bloom = new long[bloomSize];
for (byte[] value : values) {
lengthExists[value.length] = true;
long hashCode = hash(value, 0, value.length);
bloom[bloomIndex(hashCode)] |= bloomMask(hashCode);
int position = (int) (hashCode & hashTableSizeMask);
Expand Down Expand Up @@ -1003,6 +1044,12 @@ public boolean testBytes(byte[] value, int offset, int length)
return false;
}

/**
 * Pre-checks a candidate value by looking up its length in {@code lengthExists}.
 *
 * @param length the length of the candidate value
 * @return {@code true} if {@code lengthExists} is set at index {@code length}; {@code false} if it is not set or if
 * {@code length} lies outside the table (negative, or not below {@code lengthExists.length})
 */
@Override
public boolean testLength(int length)
{
    // Guard both ends of the table: a negative length would otherwise raise ArrayIndexOutOfBoundsException.
    return length >= 0 && length < lengthExists.length && lengthExists[length];
}

private static long bloomMask(long hashCode)
{
return (1L << ((hashCode >> 20) & 63)) | (1L << ((hashCode >> 26) & 63)) | (1L << ((hashCode >> 32) & 63));
Expand Down Expand Up @@ -1409,6 +1456,15 @@ public boolean testBytes(byte[] buffer, int offset, int length)

return recordTestResult(filter.testBytes(buffer, offset, length));
}

/**
 * Length pre-check that always passes and leaves the positional state untouched.
 *
 * @param length ignored
 * @return always {@code true}
 */
@Override
public boolean testLength(int length)
{
    // Returns true without advancing to the next filter because this is a pre-check followed by a test on the value,
    // which will advance the state. TODO: We could advance the state on false and not advance on true. Consider the
    // case where testLength is the only filter on a list/map inside another. This would imply exposing advancing as a
    // separate operation.
    return true;
}
}

class NullsFilter
Expand Down Expand Up @@ -1468,5 +1524,10 @@ public boolean testBytes(byte[] buffer, int offset, int length)
{
throw new UnsupportedOperationException();
}

/**
 * Length pre-checks are not supported by this filter.
 *
 * @param length ignored
 * @throws UnsupportedOperationException always
 */
@Override
public boolean testLength(int length)
{
    throw new UnsupportedOperationException();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
*/
package com.facebook.presto.orc;

import com.facebook.presto.orc.TupleDomainFilter.BigintRange;
import com.facebook.presto.orc.TupleDomainFilter.BytesRange;
import com.facebook.presto.orc.TupleDomainFilter.PositionalFilter;
import io.airlift.slice.Slices;
import org.testng.annotations.Test;

import java.util.Arrays;

import static com.facebook.presto.testing.assertions.Assert.assertEquals;

public class TestPositionalFilter
Expand All @@ -26,8 +29,9 @@ public void test()
{
PositionalFilter filter = new PositionalFilter();

// a[1] = 1 and a[3] = 3

// a[1] = '1' and a[3] = '3'. The test data is converted to byte[] values and the comparison is done using testLength()
// followed by testBytes(), so as to cover the double use of the position when testLength succeeds and testBytes
// fails.
TupleDomainFilter[] filters = new TupleDomainFilter[] {
equals(1), null, equals(3), null,
equals(1), null, equals(3), null,
Expand All @@ -36,13 +40,15 @@ public void test()
equals(1), null, equals(3), null, null, null, null
};

long[] values = new long[] {
long[] numbers = new long[] {
1, 2, 3, 4, // pass
0, 2, 3, 4, // fail
1, 2, 0, 4, 5, // fail
1, 2, 0, 4, 55, // fail testLength()
1, 0, 3, 0, 5, 6, // pass
1, 1, 2, 2, 3, 3, 4 // fail
1, 1, 2, 2, 3, 3, 4 // fail testBytes()
};
// Convert the values to byte[][].
byte[][] values = Arrays.stream(numbers).mapToObj(n -> toBytes(Long.valueOf(n).toString())).toArray(byte[][]::new);

boolean[] expectedResults = new boolean[] {
true, true, true, true,
Expand All @@ -58,7 +64,9 @@ public void test()

int valuesIndex = 0;
for (int i = 0; i < expectedResults.length; i++) {
assertEquals(expectedResults[i], filter.testLong(values[valuesIndex++]));
boolean result = filter.testLength(values[valuesIndex].length) && filter.testBytes(values[valuesIndex], 0, values[valuesIndex].length);
assertEquals(expectedResults[i], result);
valuesIndex++;
if (expectedResults[i] == false) {
valuesIndex += filter.getSucceedingPositionsToFail();
}
Expand All @@ -68,6 +76,12 @@ public void test()

// Test helper that builds the per-position filter for the given value.
// NOTE(review): this listing looks like a diff rendered without +/- markers. The BigintRange line is presumably the
// pre-change body and the BytesRange lines its replacement - confirm against the actual file.
private TupleDomainFilter equals(int value)
{
return BigintRange.of(value, value, false);
// The decimal string form of value, as bytes, is passed as both byte[] arguments of BytesRange.of.
byte[] bytesValue = toBytes(Integer.valueOf(value).toString());
return BytesRange.of(bytesValue, false, bytesValue, false, false);
}

/**
 * Converts a string to a byte array by way of {@code Slices.utf8Slice}.
 *
 * @param value the string to convert
 * @return the bytes of the slice built from {@code value}
 */
private static byte[] toBytes(String value)
{
    return Slices.utf8Slice(value).getBytes();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -200,16 +200,21 @@ public void testBytesRange()
{
TupleDomainFilter filter = BytesRange.of(toBytes("abc"), false, toBytes("abc"), false, false);
assertTrue(filter.testBytes(toBytes("abc"), 0, 3));
assertTrue(filter.testLength(3));

assertFalse(filter.testNull());
assertFalse(filter.testBytes(toBytes("apple"), 0, 5));
assertFalse(filter.testLength(4));

String theBestOfTimes = "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity,...";
filter = BytesRange.of(null, true, toBytes(theBestOfTimes), false, false);
assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, theBestOfTimes.length()));
assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 5));
assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 50));
assertTrue(filter.testBytes(toBytes(theBestOfTimes), 0, 100));
// testLength is true for all lengths for a range filter.
assertTrue(filter.testLength(1));
assertTrue(filter.testLength(1000));

assertFalse(filter.testNull());
assertFalse(filter.testBytes(toBytes("Zzz"), 0, 3));
Expand Down Expand Up @@ -279,7 +284,9 @@ public void testBytesValues()
}
}
filter = BytesValues.of(filterValues, false);
assertFalse(filter.testLength(10000));
for (int i = 0; i < testValues.length; i++) {
assertEquals(filter.testLength(i), i % 9 == 0);
assertEquals(i % 9 == 0, filter.testBytes(testValues[i], 0, testValues[i].length));
}
}
Expand Down

0 comments on commit 2d71918

Please sign in to comment.