Skip to content

Commit

Permalink
use AliasGenerator to generate aliases, use __cse prefix in commo…
Browse files Browse the repository at this point in the history
…n expression aliases, remove `DataType` from `ExprStats` as not needed, store aliases in `CommonExprs`, revert unnecessary changes
  • Loading branch information
peter-toth committed Jun 15, 2024
1 parent 476953d commit 3aa9053
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 304 deletions.
339 changes: 136 additions & 203 deletions datafusion/optimizer/src/common_subexpr_eliminate.rs

Large diffs are not rendered by default.

27 changes: 13 additions & 14 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1976,7 +1976,7 @@ drop table t;

# test count with largeutf8
statement ok
create table t (c string) as values
create table t (c string) as values
(arrow_cast('a', 'LargeUtf8')),
(arrow_cast('b', 'LargeUtf8')),
(arrow_cast(null, 'LargeUtf8')),
Expand Down Expand Up @@ -3784,7 +3784,6 @@ Y 2021-01-01 2021-01-01T00:00:00
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Date32\)'\. You might need to add explicit type casts\.
SELECT avg(date32), avg(date64) FROM t


statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Date32\)'\. You might need to add explicit type casts\.
SELECT tag, avg(date32), avg(date64) FROM t GROUP BY tag ORDER BY tag;

Expand Down Expand Up @@ -4048,7 +4047,7 @@ statement ok
create table t (c1 decimal(10, 0), c2 int) as values (null, null), (null, null), (null, null);

query RTIT
select
select
sum(c1), arrow_typeof(sum(c1)),
sum(c2), arrow_typeof(sum(c2))
from t;
Expand Down Expand Up @@ -4682,7 +4681,7 @@ NULL NULL 3 NULL 1 4 0 8 0

# regr_*() basic tests
query RRRRRRRRR
select
select
regr_slope(column2, column1),
regr_intercept(column2, column1),
regr_count(column2, column1),
Expand All @@ -4697,7 +4696,7 @@ from (values (1,2), (2,4), (3,6));
2 0 3 1 2 4 2 8 4

query RRRRRRRRR
select
select
regr_slope(c12, c11),
regr_intercept(c12, c11),
regr_count(c12, c11),
Expand All @@ -4715,7 +4714,7 @@ from aggregate_test_100;

# regr_*() functions ignore NULLs
query RRRRRRRRR
select
select
regr_slope(column2, column1),
regr_intercept(column2, column1),
regr_count(column2, column1),
Expand All @@ -4730,7 +4729,7 @@ from (values (1,NULL), (2,4), (3,6));
2 0 2 1 2.5 5 0.5 2 1

query RRRRRRRRR
select
select
regr_slope(column2, column1),
regr_intercept(column2, column1),
regr_count(column2, column1),
Expand All @@ -4745,7 +4744,7 @@ from (values (1,NULL), (NULL,4), (3,6));
NULL NULL 1 NULL 3 6 0 0 0

query RRRRRRRRR
select
select
regr_slope(column2, column1),
regr_intercept(column2, column1),
regr_count(column2, column1),
Expand All @@ -4760,8 +4759,8 @@ from (values (1,NULL), (NULL,4), (NULL,NULL));
NULL NULL 0 NULL NULL NULL NULL NULL NULL

query TRRRRRRRRR rowsort
select
column3,
select
column3,
regr_slope(column2, column1),
regr_intercept(column2, column1),
regr_count(column2, column1),
Expand All @@ -4785,7 +4784,7 @@ statement ok
set datafusion.execution.batch_size = 1;

query RRRRRRRRR
select
select
regr_slope(c12, c11),
regr_intercept(c12, c11),
regr_count(c12, c11),
Expand All @@ -4803,7 +4802,7 @@ statement ok
set datafusion.execution.batch_size = 2;

query RRRRRRRRR
select
select
regr_slope(c12, c11),
regr_intercept(c12, c11),
regr_count(c12, c11),
Expand All @@ -4821,7 +4820,7 @@ statement ok
set datafusion.execution.batch_size = 3;

query RRRRRRRRR
select
select
regr_slope(c12, c11),
regr_intercept(c12, c11),
regr_count(c12, c11),
Expand Down Expand Up @@ -5014,7 +5013,7 @@ select count(*) from (select count(*) a, count(*) b from (select 1));

# UTF8 string matters for string to &[u8] conversion, add it to prevent regression
statement ok
create table distinct_count_string_table as values
create table distinct_count_string_table as values
(1, 'a', 'longstringtest_a', '台灣'),
(2, 'b', 'longstringtest_b1', '日本'),
(2, 'b', 'longstringtest_b2', '中國'),
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/group_by.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4187,8 +4187,8 @@ EXPLAIN SELECT SUM(DISTINCT CAST(x AS DOUBLE)), MAX(DISTINCT CAST(x AS DOUBLE))
logical_plan
01)Projection: sum(alias1) AS sum(DISTINCT t1.x), MAX(alias1) AS MAX(DISTINCT t1.x)
02)--Aggregate: groupBy=[[t1.y]], aggr=[[sum(alias1), MAX(alias1)]]
03)----Aggregate: groupBy=[[t1.y, #1 AS t1.x AS alias1]], aggr=[[]]
04)------Projection: CAST(t1.x AS Float64) AS #1, t1.y
03)----Aggregate: groupBy=[[t1.y, __cse_1 AS t1.x AS alias1]], aggr=[[]]
04)------Projection: CAST(t1.x AS Float64) AS __cse_1, t1.y
05)--------TableScan: t1 projection=[x, y]
physical_plan
01)ProjectionExec: expr=[sum(alias1)@1 as sum(DISTINCT t1.x), MAX(alias1)@2 as MAX(DISTINCT t1.x)]
Expand All @@ -4200,8 +4200,8 @@ physical_plan
07)------------CoalesceBatchesExec: target_batch_size=2
08)--------------RepartitionExec: partitioning=Hash([y@0, alias1@1], 8), input_partitions=8
09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
10)------------------AggregateExec: mode=Partial, gby=[y@1 as y, #1@0 as alias1], aggr=[]
11)--------------------ProjectionExec: expr=[CAST(x@0 AS Float64) as #1, y@1 as y]
10)------------------AggregateExec: mode=Partial, gby=[y@1 as y, __cse_1@0 as alias1], aggr=[]
11)--------------------ProjectionExec: expr=[CAST(x@0 AS Float64) as __cse_1, y@1 as y]
12)----------------------MemoryExec: partitions=1, partition_sizes=[1]

# create an unbounded table that contains ordered timestamp.
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/joins.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2496,31 +2496,31 @@ SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM test_timestamps

# test timestamp join on micros datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM test_timestamps_table ) as t2 ON t1.micros = t2.micros;
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM test_timestamps_table ) as t2 ON t1.micros = t2.micros
----
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3

# test timestamp with timezone join on micros datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM test_timestamps_tz_table ) as t2 ON t1.micros = t2.micros;
SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM test_timestamps_tz_table ) as t2 ON t1.micros = t2.micros
----
2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z 2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0 2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z 2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3

# test timestamp join on millis datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM test_timestamps_table ) as t2 ON t1.millis = t2.millis;
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM test_timestamps_table ) as t2 ON t1.millis = t2.millis
----
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3

# test timestamp with timezone join on millis datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM test_timestamps_tz_table ) as t2 ON t1.millis = t2.millis;
SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM test_timestamps_tz_table ) as t2 ON t1.millis = t2.millis
----
2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z 2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0 2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z 2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/scalar.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1906,7 +1906,7 @@ D false

# test string_temporal_coercion
query BBBBBBBBBB
select
select
arrow_cast(to_timestamp('2020-01-01 01:01:11.1234567890Z'), 'Timestamp(Second, None)') == '2020-01-01T01:01:11',
arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)') == arrow_cast('2020-01-02T01:01:11', 'LargeUtf8'),
arrow_cast(to_timestamp('2020-01-03 01:01:11.1234567890Z'), 'Time32(Second)') == '01:01:11',
Expand Down
16 changes: 8 additions & 8 deletions datafusion/sqllogictest/test_files/select.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1436,12 +1436,12 @@ query TT
EXPLAIN SELECT x/2, x/2+1 FROM t;
----
logical_plan
01)Projection: #1 AS t.x / Int64(2), #1 AS t.x / Int64(2) + Int64(1)
02)--Projection: t.x / Int64(2) AS #1
01)Projection: __cse_1 AS t.x / Int64(2), __cse_1 AS t.x / Int64(2) + Int64(1)
02)--Projection: t.x / Int64(2) AS __cse_1
03)----TableScan: t projection=[x]
physical_plan
01)ProjectionExec: expr=[#1@0 as t.x / Int64(2), #1@0 + 1 as t.x / Int64(2) + Int64(1)]
02)--ProjectionExec: expr=[x@0 / 2 as #1]
01)ProjectionExec: expr=[__cse_1@0 as t.x / Int64(2), __cse_1@0 + 1 as t.x / Int64(2) + Int64(1)]
02)--ProjectionExec: expr=[x@0 / 2 as __cse_1]
03)----MemoryExec: partitions=1, partition_sizes=[1]

query II
Expand All @@ -1454,12 +1454,12 @@ query TT
EXPLAIN SELECT abs(x), abs(x) + abs(y) FROM t;
----
logical_plan
01)Projection: #1 AS abs(t.x), #1 AS abs(t.x) + abs(t.y)
02)--Projection: abs(t.x) AS #1, t.y
01)Projection: __cse_1 AS abs(t.x), __cse_1 AS abs(t.x) + abs(t.y)
02)--Projection: abs(t.x) AS __cse_1, t.y
03)----TableScan: t projection=[x, y]
physical_plan
01)ProjectionExec: expr=[#1@0 as abs(t.x), #1@0 + abs(y@1) as abs(t.x) + abs(t.y)]
02)--ProjectionExec: expr=[abs(x@0) as #1, y@1 as y]
01)ProjectionExec: expr=[__cse_1@0 as abs(t.x), __cse_1@0 + abs(y@1) as abs(t.x) + abs(t.y)]
02)--ProjectionExec: expr=[abs(x@0) as __cse_1, y@1 as y]
03)----MemoryExec: partitions=1, partition_sizes=[1]

query II
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/subquery.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1069,8 +1069,8 @@ query TT
explain select a/2, a/2 + 1 from t
----
logical_plan
01)Projection: #1 AS t.a / Int64(2), #1 AS t.a / Int64(2) + Int64(1)
02)--Projection: t.a / Int64(2) AS #1
01)Projection: __cse_1 AS t.a / Int64(2), __cse_1 AS t.a / Int64(2) + Int64(1)
02)--Projection: t.a / Int64(2) AS __cse_1
03)----TableScan: t projection=[a]

statement ok
Expand All @@ -1080,8 +1080,8 @@ query TT
explain select a/2, a/2 + 1 from t
----
logical_plan
01)Projection: #1 AS t.a / Int64(2), #1 AS t.a / Int64(2) + Int64(1)
02)--Projection: t.a / Int64(2) AS #1
01)Projection: __cse_1 AS t.a / Int64(2), __cse_1 AS t.a / Int64(2) + Int64(1)
02)--Projection: t.a / Int64(2) AS __cse_1
03)----TableScan: t projection=[a]

###
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/tpch/q1.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ explain select
logical_plan
01)Sort: lineitem.l_returnflag ASC NULLS LAST, lineitem.l_linestatus ASC NULLS LAST
02)--Projection: lineitem.l_returnflag, lineitem.l_linestatus, sum(lineitem.l_quantity) AS sum_qty, sum(lineitem.l_extendedprice) AS sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax) AS sum_charge, AVG(lineitem.l_quantity) AS avg_qty, AVG(lineitem.l_extendedprice) AS avg_price, AVG(lineitem.l_discount) AS avg_disc, COUNT(*) AS count_order
03)----Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(#1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(#1 * (Decimal128(Some(1),20,0) + lineitem.l_tax)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), AVG(lineitem.l_quantity), AVG(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(Int64(1)) AS COUNT(*)]]
04)------Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS #1, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_tax, lineitem.l_returnflag, lineitem.l_linestatus
03)----Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(__cse_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(__cse_1 * (Decimal128(Some(1),20,0) + lineitem.l_tax)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), AVG(lineitem.l_quantity), AVG(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(Int64(1)) AS COUNT(*)]]
04)------Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __cse_1, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_tax, lineitem.l_returnflag, lineitem.l_linestatus
05)--------Filter: lineitem.l_shipdate <= Date32("1998-09-02")
06)----------TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], partial_filters=[lineitem.l_shipdate <= Date32("1998-09-02")]
physical_plan
Expand All @@ -54,7 +54,7 @@ physical_plan
05)--------CoalesceBatchesExec: target_batch_size=8192
06)----------RepartitionExec: partitioning=Hash([l_returnflag@0, l_linestatus@1], 4), input_partitions=4
07)------------AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), AVG(lineitem.l_quantity), AVG(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(*)]
08)--------------ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as #1, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
08)--------------ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __cse_1, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
09)----------------CoalesceBatchesExec: target_batch_size=8192
10)------------------FilterExec: l_shipdate@6 <= 1998-09-02
11)--------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], has_header=false
Expand Down
Loading

0 comments on commit 3aa9053

Please sign in to comment.