Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Estimate cost of inequality comparison filters #11518

Merged
merged 1 commit into from
Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import static io.trino.cost.SymbolStatsEstimate.buildFrom;
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved
import static io.trino.util.MoreMath.averageExcludingNaNs;
import static io.trino.util.MoreMath.max;
import static io.trino.util.MoreMath.maxExcludeNaN;
import static io.trino.util.MoreMath.min;
import static io.trino.util.MoreMath.minExcludeNaN;
import static java.lang.Double.NEGATIVE_INFINITY;
import static java.lang.Double.NaN;
import static java.lang.Double.POSITIVE_INFINITY;
Expand All @@ -31,6 +33,11 @@

public final class ComparisonStatsCalculator
{
// Fraction of value pairs from the overlapping part of two ranges that is assumed to satisfy an inequality predicate.
// We assume uniform distribution of values within each range.
// Within the overlapping range, we assume that all pairs of distinct values from both ranges exist.
// Based on the above, we estimate that, on average, half of the pairs of values will match the inequality predicate.
public static final double OVERLAPPING_RANGE_INEQUALITY_FILTER_COEFFICIENT = 0.5;
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved

// Private no-op constructor: this final class cannot be instantiated from outside.
private ComparisonStatsCalculator() {}

public static PlanNodeStatsEstimate estimateExpressionToLiteralComparison(
Expand Down Expand Up @@ -164,6 +171,13 @@ public static PlanNodeStatsEstimate estimateExpressionToExpressionComparison(
case LESS_THAN_OR_EQUAL:
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
return estimateExpressionToExpressionInequality(
operator,
inputStatistics,
leftExpressionStatistics,
leftExpressionSymbol,
rightExpressionStatistics,
rightExpressionSymbol);
case IS_DISTINCT_FROM:
return PlanNodeStatsEstimate.unknown();
}
Expand Down Expand Up @@ -239,4 +253,128 @@ private static PlanNodeStatsEstimate estimateExpressionNotEqualToExpression(
rightExpressionSymbol.ifPresent(symbol -> result.addSymbolStatistics(symbol, rightNullsFiltered));
return result.build();
}

private static PlanNodeStatsEstimate estimateExpressionToExpressionInequality(
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved
ComparisonExpression.Operator operator,
PlanNodeStatsEstimate inputStatistics,
SymbolStatsEstimate leftExpressionStatistics,
Optional<Symbol> leftExpressionSymbol,
SymbolStatsEstimate rightExpressionStatistics,
Optional<Symbol> rightExpressionSymbol)
{
if (leftExpressionStatistics.isUnknown() || rightExpressionStatistics.isUnknown()) {
return PlanNodeStatsEstimate.unknown();
}
if (isNaN(leftExpressionStatistics.getNullsFraction()) && isNaN(rightExpressionStatistics.getNullsFraction())) {
sopel39 marked this conversation as resolved.
Show resolved Hide resolved
return PlanNodeStatsEstimate.unknown();
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved
}
if (leftExpressionStatistics.statisticRange().isEmpty() || rightExpressionStatistics.statisticRange().isEmpty()) {
return inputStatistics.mapOutputRowCount(rowCount -> 0.0);
}

// We don't know the correlation between NULLs, so we take the max nullsFraction from the expression statistics
// to make a conservative estimate (nulls are fully correlated) for the NULLs filter factor
double nullsFilterFactor = 1 - maxExcludeNaN(leftExpressionStatistics.getNullsFraction(), rightExpressionStatistics.getNullsFraction());
switch (operator) {
case LESS_THAN:
case LESS_THAN_OR_EQUAL:
return estimateExpressionLessThanOrEqualToExpression(
inputStatistics,
leftExpressionStatistics,
leftExpressionSymbol,
rightExpressionStatistics,
rightExpressionSymbol,
nullsFilterFactor);
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
sopel39 marked this conversation as resolved.
Show resolved Hide resolved
return estimateExpressionLessThanOrEqualToExpression(
inputStatistics,
rightExpressionStatistics,
rightExpressionSymbol,
leftExpressionStatistics,
leftExpressionSymbol,
nullsFilterFactor);
default:
throw new IllegalArgumentException("Unsupported inequality operator " + operator);
}
}

private static PlanNodeStatsEstimate estimateExpressionLessThanOrEqualToExpression(
PlanNodeStatsEstimate inputStatistics,
SymbolStatsEstimate leftExpressionStatistics,
Optional<Symbol> leftExpressionSymbol,
SymbolStatsEstimate rightExpressionStatistics,
Optional<Symbol> rightExpressionSymbol,
double nullsFilterFactor)
{
StatisticRange leftRange = StatisticRange.from(leftExpressionStatistics);
StatisticRange rightRange = StatisticRange.from(rightExpressionStatistics);
// left is always greater than right, no overlap
if (leftRange.getLow() > rightRange.getHigh()) {
return inputStatistics.mapOutputRowCount(rowCount -> 0.0);
}
// left is always lesser than right
if (leftRange.getHigh() < rightRange.getLow()) {
PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics);
leftExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(
symbol,
leftExpressionStatistics.mapNullsFraction(nullsFraction -> 0.0)));
rightExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(
symbol,
rightExpressionStatistics.mapNullsFraction(nullsFraction -> 0.0)));
return estimate.setOutputRowCount(inputStatistics.getOutputRowCount() * nullsFilterFactor)
.build();
}

PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics);
double leftOverlappingRangeFraction = leftRange.overlapPercentWith(rightRange);
double leftAlwaysLessRangeFraction;
if (leftRange.getLow() < rightRange.getLow()) {
leftAlwaysLessRangeFraction = min(
raunaqmorarka marked this conversation as resolved.
Show resolved Hide resolved
leftRange.overlapPercentWith(new StatisticRange(leftRange.getLow(), rightRange.getLow(), NaN)),
// Prevents expanding NDVs in case range fractions addition goes beyond 1 for infinite ranges
1 - leftOverlappingRangeFraction);
}
else {
leftAlwaysLessRangeFraction = 0;
}
leftExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(
symbol,
SymbolStatsEstimate.builder()
.setLowValue(leftRange.getLow())
.setHighValue(minExcludeNaN(leftRange.getHigh(), rightRange.getHigh()))
.setAverageRowSize(leftExpressionStatistics.getAverageRowSize())
.setDistinctValuesCount(leftExpressionStatistics.getDistinctValuesCount() * (leftAlwaysLessRangeFraction + leftOverlappingRangeFraction))
.setNullsFraction(0)
.build()));

double rightOverlappingRangeFraction = rightRange.overlapPercentWith(leftRange);
double rightAlwaysGreaterRangeFraction;
if (leftRange.getHigh() < rightRange.getHigh()) {
rightAlwaysGreaterRangeFraction = min(
rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(), rightRange.getHigh(), NaN)),
// Prevents expanding NDVs in case range fractions addition goes beyond 1 for infinite ranges
1 - rightOverlappingRangeFraction);
}
else {
rightAlwaysGreaterRangeFraction = 0;
}
rightExpressionSymbol.ifPresent(symbol -> estimate.addSymbolStatistics(
symbol,
SymbolStatsEstimate.builder()
.setLowValue(maxExcludeNaN(leftRange.getLow(), rightRange.getLow()))
.setHighValue(rightRange.getHigh())
.setAverageRowSize(rightExpressionStatistics.getAverageRowSize())
.setDistinctValuesCount(rightExpressionStatistics.getDistinctValuesCount() * (rightOverlappingRangeFraction + rightAlwaysGreaterRangeFraction))
.setNullsFraction(0)
.build()));
double filterFactor =
// all left range values which are below right range are selected
leftAlwaysLessRangeFraction +
// for pairs in overlapping range, only half of pairs are selected
leftOverlappingRangeFraction * rightOverlappingRangeFraction * OVERLAPPING_RANGE_INEQUALITY_FILTER_COEFFICIENT +
// all pairs where left value is in overlapping range and right value is above left range are selected
leftOverlappingRangeFraction * rightAlwaysGreaterRangeFraction;
return estimate.setOutputRowCount(inputStatistics.getOutputRowCount() * nullsFilterFactor * filterFactor).build();
sopel39 marked this conversation as resolved.
Show resolved Hide resolved
}
}
Loading