Skip to content

Commit

Permalink
Merge #74303
Browse files Browse the repository at this point in the history
74303: opt: split a disjunction of equijoin predicates into a union of joins r=msirek a=msirek

Previously, when the ON clause of an inner, semi, or anti join contained
ORed equality predicates, the only available join method was a cross join.

This was inadequate because a cross join is often the worst-performing
join method for joining large tables.

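To address this, this patch adds a new cost-based transformation that
evaluates each disjunct in a separate join and then combines the results:
the per-disjunct results are unioned for inner and semi joins (a row
qualifies if any disjunct matches) and intersected for anti joins (a row
qualifies only if no disjunct matches).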

Fixes #74302

Example query:

```
SELECT *
FROM   classRequest
       INNER JOIN classes
               ON classRequest.firstChoiceClassid = classes.classid
                  OR classRequest.secondChoiceClassid = classes.classid;
```
Transformation result written in pseudo-SQL:
```
SELECT DISTINCT ON (classes.<rowid_or_primary_key_columns>,
                    classRequest.<rowid_or_primary_key_columns>)
       dt.*
FROM   (
        SELECT     *
        FROM       classRequest
                   INNER JOIN classes
                   ON         classRequest.firstChoiceClassid =
                                classes.classid
                   UNION ALL
        SELECT     *
        FROM       classRequest
                   INNER JOIN classes
                   ON         classRequest.secondChoiceClassid =
                                classes.classid
       ) dt;
```

In addition, selectivity estimation for ORed ON clauses is enhanced to
estimate the selectivity of each '=' predicate separately and to combine
the estimates iteratively, as PostgreSQL does. This enables the optimizer
to cost the rewritten plan more accurately, so that the plan is chosen
when it is actually cheaper.

Release justification: Performance improvement for queries with ORed
join predicates and improved selectivity estimation of ORed predicates

Release note (performance improvement): Performance of inner, semi, and
anti joins between two tables with ORed equijoin predicates is improved
by enabling the optimizer to select a join plan in which each equijoin
predicate is evaluated by a separate join, with the results of the joins
unioned or intersected together.

Co-authored-by: Mark Sirek <[email protected]>
  • Loading branch information
craig[bot] and msirek committed Apr 2, 2022
2 parents 0e6878b + e3de17a commit 2ecd90c
Show file tree
Hide file tree
Showing 14 changed files with 6,964 additions and 31 deletions.
968 changes: 968 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/disjunction_in_join

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/inner-join
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,33 @@ SELECT * from abc WHERE EXISTS (SELECT * FROM def WHERE a=d AND c=e)
1 1 2
2 1 1

# Exists with primary key columns selected
query III rowsort
SELECT a, b, c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----
1 1 2
2 1 1
2 2 NULL

# Exists with primary key columns not selected
query I rowsort
SELECT c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----
2
1
NULL

# Not Exists with primary key columns selected
query III rowsort
SELECT a, b, c FROM abc WHERE NOT EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----

# Not Exists with primary key columns not selected
query I rowsort
SELECT c FROM abc WHERE NOT EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----


# A semi-join emits exactly one row for every matching row in the LHS.
# The following test ensures that the SemiJoin doesn't commute into an
# InnerJoin as that guarantee would be lost.
Expand All @@ -38,11 +65,28 @@ INSERT INTO abc VALUES (1, 1, 1)
statement ok
INSERT INTO def VALUES (1, 1, 1), (2, 1, 1)

# Exists with primary key columns selected
query III rowsort
SELECT a, b, c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----
1 1 1

# Exists with primary key columns not selected
query I rowsort
SELECT c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----
1

# Not Exists with primary key columns selected
query III rowsort
SELECT a, b, c FROM abc WHERE NOT EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----

# Not Exists with primary key columns not selected
query I rowsort
SELECT c FROM abc WHERE NOT EXISTS (SELECT * FROM def WHERE a=d OR a=e)
----

# Given that we know the reason the above query would fail if an InnerJoin
# was used - multiple rows emitted for each matching row in the LHS - we
# might think that adding a DistinctOn over the InnerJoin would help.
Expand All @@ -58,6 +102,21 @@ SELECT a, b, c FROM abc WHERE EXISTS (SELECT * FROM def WHERE a=d OR a=e)
#
# This tests that the InnerJoin commute rule for semi joins behaves sanely in
# these cases.

# InnerJoin with primary key columns selected
query III rowsort
SELECT a, b, c FROM abc, def WHERE a=d OR a=e
----
1 1 1
1 1 1

# InnerJoin with primary key columns not selected
query I rowsort
SELECT c FROM abc, def WHERE a=d OR a=e
----
1
1

statement ok
CREATE TABLE abc_decimal (a DECIMAL, b DECIMAL, c DECIMAL);
INSERT INTO abc_decimal VALUES (1, 1, 1), (1, 1, 1), (1.0, 1.0, 1.0), (1.00, 1.00, 1.00)
Expand All @@ -73,3 +132,23 @@ SELECT a, b, c FROM abc_decimal WHERE EXISTS (SELECT * FROM def_decimal WHERE a:
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal WHERE EXISTS (SELECT * FROM def_decimal WHERE a::string=d::string or a::string=e::string)
----
1 1 1
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal, def_decimal WHERE a::string=d::string or a::string=e::string
----
1 1 1
1 1 1
1.0 1.0 1.0
1.00 1.00 1.00

query RRR rowsort
SELECT a, b, c FROM abc_decimal WHERE NOT EXISTS (SELECT * FROM def_decimal WHERE a::string=d::string or a::string=e::string)
----
115 changes: 115 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/subquery
Original file line number Diff line number Diff line change
Expand Up @@ -550,3 +550,118 @@ WHERE
t.oid
NOT IN (SELECT (ARRAY[704, 11676, 10005, 3912, 11765, 59410, 11397])[i] FROM generate_series(1, 376) AS i)
----

statement ok
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]'

statement ok
ALTER TABLE abc INJECT STATISTICS '[
{
"columns": ["b"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 10000
}
]'

statement ok
ALTER TABLE xyz INJECT STATISTICS '[
{
"columns": ["x"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]'

statement ok
ALTER TABLE xyz INJECT STATISTICS '[
{
"columns": ["y"],
"created_at": "2018-05-01 1:00:00.00000+00:00",
"row_count": 1000,
"distinct_count": 1000
}
]'

statement ok
INSERT INTO xyz VALUES(5, 4, 7)

statement ok
INSERT INTO abc VALUES(12, 13, 14)

statement ok
CREATE INDEX abc_b ON abc(b)

statement ok
CREATE INDEX xyz_y ON xyz(y)

### Split Disjunctions Tests
query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.x OR abc.b = xyz.y)
----
4 5 6
7 8 9
2 5 6

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.y OR abc.b = xyz.x)
----
2 5 6
4 5 6
12 13 14

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x OR abc.b = xyz.y)and abc.a > 3 AND xyz.z > 10)
----
7 8 9

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.y OR abc.b = xyz.x) AND abc.a > 3 AND xyz.z > 10)
----
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.x OR abc.b = xyz.y)
----
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE abc.a = xyz.y OR abc.b = xyz.x)
----
7 8 9

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x OR abc.b = xyz.y)and abc.a > 3 AND xyz.z > 10)
----
2 5 6
4 5 6
12 13 14

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.y OR abc.b = xyz.x) AND abc.a > 3 AND xyz.z > 10)
----
2 5 6
4 5 6
7 8 9

query III rowsort
SELECT * FROM abc WHERE EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x OR abc.b = xyz.y) AND (abc.a = xyz.y OR abc.b = xyz.y))
----
4 5 6
2 5 6

query III rowsort
SELECT * FROM abc WHERE NOT EXISTS (SELECT * FROM xyz WHERE (abc.a = xyz.x OR abc.b = xyz.y) AND (abc.a = xyz.y OR abc.b = xyz.y))
----
7 8 9
12 13 14

### End Split Disjunctions Tests
Loading

0 comments on commit 2ecd90c

Please sign in to comment.