From a4852dc20dd5f3944a681648f70cd3a99d2b7bac Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Aug 2024 11:21:08 +0200 Subject: [PATCH 1/7] feat:add operators to support duplicate eliminated joins --- proto/substrait/algebra.proto | 49 +++++++++++++++++++++++ site/docs/relations/physical_relations.md | 40 +++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 08f775a8f..5bfb3bdb9 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -480,6 +480,8 @@ message Rel { HashJoinRel hash_join = 13; MergeJoinRel merge_join = 14; NestedLoopJoinRel nested_loop_join = 18; + DuplicateEliminatedGetRel duplicate_eliminated_get = 23; + DuplicateEliminatedJoinRel duplicate_eliminated_join = 24; ConsistentPartitionWindowRel window = 17; ExchangeRel exchange = 15; ExpandRel expand = 16; @@ -773,6 +775,53 @@ message NestedLoopJoinRel { substrait.extensions.AdvancedExtension advanced_extension = 10; } +message DuplicateEliminatedGetRel { + RelCommon common = 1; + ReferenceRel input = 2; + repeated Expression.FieldReference column_ids = 3; +} + +message DuplicateEliminatedJoinRel { + RelCommon common = 1; + Rel left = 2; + Rel right = 3; + + Expression expression = 4; + Expression post_join_filter = 5; + + JoinType type = 6; + + // The set of columns that will be duplicate eliminated from the LHS and pushed into the RHS + repeated Expression.FieldReference duplicate_eliminated_columns = 7; + + DuplicateEliminatedSide duplicate_eliminated_side = 8; + + // If this is a DuplicateEliminatedJoin, whether it has been flipped to de-duplicating the LHS or RHS + enum DuplicateEliminatedSide { + DUPLICATE_ELIMINATED_SIDE_UNSPECIFIED = 0; + DUPLICATE_ELIMINATED_SIDE_LEFT = 1; + DUPLICATE_ELIMINATED_SIDE_RIGHT = 2; + } + + enum JoinType { + JOIN_TYPE_UNSPECIFIED = 0; + JOIN_TYPE_INNER = 1; + JOIN_TYPE_OUTER = 2; + JOIN_TYPE_LEFT = 3; + JOIN_TYPE_RIGHT = 4; + 
JOIN_TYPE_LEFT_SEMI = 5; + JOIN_TYPE_LEFT_ANTI = 6; + JOIN_TYPE_LEFT_SINGLE = 7; + JOIN_TYPE_RIGHT_SEMI = 8; + JOIN_TYPE_RIGHT_ANTI = 9; + JOIN_TYPE_RIGHT_SINGLE = 10; + JOIN_TYPE_LEFT_MARK = 11; + JOIN_TYPE_RIGHT_MARK = 12; + } + + substrait.extensions.AdvancedExtension advanced_extension = 10; +} + // The argument of a function message FunctionArgument { oneof arg_type { diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index b7482b693..1ca82ba48 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -47,7 +47,45 @@ The nested loop join operator does a join by holding the entire right input and | Join Expression | A boolean condition that describes whether each record from the left set "match" the record from the right set. | Optional. Defaults to true (a Cartesian join). | | Join Type | One of the join types defined in the Join operator. | Required | - +## Duplicate Eliminated Join Operator +The Duplicate Eliminated Join, along with the [Duplicate Eliminated Get Operator](physical_relations.md#duplicate-eliminated-get-operator) are the two necessary operators that enable general subquery unnesting. (See the [Unnesting Arbitrary Queries](https://cs.emis.de/LNI/Proceedings/Proceedings241/383.pdf) paper for more information.) + +The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_relations.md#join-operator). It can have any regular join type, and its execution is the same. The main difference is that one of its children has, somewhere in its subtree, a dependency on the deduplicated result of the other. Therefore, this operator pushes the deduplicated result to its dependent child via the Duplicate Eliminated Get Operator. The side that will be deduplicated is specified in the Duplicate Eliminated Side property. The other side is the one that depends on the deduplication. 
+ +| Signature | Value | +| -------------------- |----------------------------------------------------------------------------------------------| +| Inputs | 2 | +| Outputs | 1 | +| Property Maintenance | It is the same as the [Hash Equijoin Operator](physical_relations.md#hash-equijoin-operator) | +| Direct Output Order | Same as the [Join](logical_relations.md#join-operator) operator. | + +### Duplicate Eliminated Join Properties + +| Property | Description | Required | +|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------| +| Left Input | A relational input. | Required | +| Right Input | A relational input. | Required | +| Left Keys | References to the fields to join on in the left input. | Required | +| Right Keys | References to the fields to join on in the right input. | Required | +| Post Join Predicate | An additional expression that can be used to reduce the output of the join operation post the equality condition. Minimizes the overhead of secondary join conditions that cannot be evaluated using the equijoin keys. | Optional, defaults true. | +| Join Type | One of the join types defined in the Join operator. | Required | +| Duplicate Eliminated Side | The side that is deduplicated and pushed into the other side. | Required | + +## Duplicate Eliminated Get Operator +An operator that takes as its input the result of the deduplicated side of the Duplicate Eliminated Join. It simply scans the input and outputs the deduplicated. +| Signature | Value | +| -------------------- |-------------------------------------------------------------------------------------| +| Inputs | 1 | +| Outputs | 1 | +| Property Maintenance | Distribution is not maintained due to the deduplication. Orderedness is eliminated. 
| +| Direct Output Order | It will only project the deduplicated columns from its input | + +### Duplicate Eliminated Get Properties + +| Property | Description | Required | +|------------|----------------------------------------------------|-----------------------------| +| Input | A relational input. | Required | +| Column IDs | The columns that were deduplicated from the input. | Required | ## Merge Equijoin Operator From bd0eb8bc4fcf7b13c36fae8dbe7869f3b94a2a83 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 30 Aug 2024 13:02:53 +0200 Subject: [PATCH 2/7] add values working to get operator --- site/docs/relations/physical_relations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index 1ca82ba48..b67f29438 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -72,7 +72,7 @@ The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_r | Duplicate Eliminated Side | The side that is deduplicated and pushed into the other side. | Required | ## Duplicate Eliminated Get Operator -An operator that takes as its input the result of the deduplicated side of the Duplicate Eliminated Join. It simply scans the input and outputs the deduplicated. +An operator that takes as its input the result of the deduplicated side of the Duplicate Eliminated Join. It simply scans the input and outputs the deduplicated values. 
| Signature | Value | | -------------------- |-------------------------------------------------------------------------------------| | Inputs | 1 | From fa73e92cff0bd9e73d5064812b98cdd2a39b3071 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 3 Sep 2024 10:23:54 +0200 Subject: [PATCH 3/7] Update site/docs/relations/physical_relations.md Co-authored-by: Weston Pace --- site/docs/relations/physical_relations.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index b67f29438..da30b07e5 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -50,7 +50,9 @@ The nested loop join operator does a join by holding the entire right input and ## Duplicate Eliminated Join Operator The Duplicate Eliminated Join, along with the [Duplicate Eliminated Get Operator](physical_relations.md#duplicate-eliminated-get-operator) are the two necessary operators that enable general subquery unnesting. (See the [Unnesting Arbitrary Queries](https://cs.emis.de/LNI/Proceedings/Proceedings241/383.pdf) paper for more information.) -The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_relations.md#join-operator). It can have any regular join type, and its execution is the same. The main difference is that one of its children has, somewhere in its subtree, a dependency on the deduplicated result of the other. Therefore, this operator pushes the deduplicated result to its dependent child via the Duplicate Eliminated Get Operator. The side that will be deduplicated is specified in the Duplicate Eliminated Side property. The other side is the one that depends on the deduplication. +The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_relations.md#join-operator). It can have any regular join type, and its execution is the same. The only restriction is that the join must be a hash equi-join. 
The main difference is that one of its children has, somewhere in its subtree, a dependency on the deduplicated result of the other. Therefore, this operator pushes the deduplicated result to its dependent child via the Duplicate Eliminated Get Operator. The side that will be deduplicated is specified in the Duplicate Eliminated Side property. The other side is the one that depends on the deduplication. + +The duplicate eliminated join has two outputs. The first output is no different than a regular join output. The second output is the output to the duplicate eliminated get operator. This output contains only the columns listed in the `duplicate_eliminated_columns` property. This output must be emitted as soon as the build phase is complete. This is because this output is required to compute the probe input. | Signature | Value | | -------------------- |----------------------------------------------------------------------------------------------| From 8c376c6f6d8571776658941efe72fc9b935fb662 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 3 Sep 2024 14:31:16 +0200 Subject: [PATCH 4/7] Update site/docs/relations/physical_relations.md Co-authored-by: Weston Pace --- site/docs/relations/physical_relations.md | 1 + 1 file changed, 1 insertion(+) diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index da30b07e5..a4bffad38 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -72,6 +72,7 @@ The duplicate eliminated join has two outputs. The first output is no different | Post Join Predicate | An additional expression that can be used to reduce the output of the join operation post the equality condition. Minimizes the overhead of secondary join conditions that cannot be evaluated using the equijoin keys. | Optional, defaults true. | | Join Type | One of the join types defined in the Join operator. 
| Required | | Duplicate Eliminated Side | The side that is deduplicated and pushed into the other side. | Required | +| Duplicate Eliminated Columns | The columns that should be included in the deduplicated output. These columns must be key columns. | Required | ## Duplicate Eliminated Get Operator An operator that takes as its input the result of the deduplicated side of the Duplicate Eliminated Join. It simply scans the input and outputs the deduplicated values. From bbc75dd98056365024934e93edba762af91b1f76 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 3 Sep 2024 14:31:32 +0200 Subject: [PATCH 5/7] Update site/docs/relations/physical_relations.md Co-authored-by: Weston Pace --- site/docs/relations/physical_relations.md | 1 - 1 file changed, 1 deletion(-) diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index a4bffad38..d85d51bcb 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -88,7 +88,6 @@ An operator that takes as its input the result of the deduplicated side of the D | Property | Description | Required | |------------|----------------------------------------------------|-----------------------------| | Input | A relational input. | Required | -| Column IDs | The columns that were deduplicated from the input. 
| Required | ## Merge Equijoin Operator From 5fa49e71d39026695e2e76064925d3329ef08a71 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 3 Sep 2024 14:37:18 +0200 Subject: [PATCH 6/7] PR requests --- proto/substrait/algebra.proto | 1 - site/docs/relations/physical_relations.md | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 5bfb3bdb9..abcd95a36 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -778,7 +778,6 @@ message NestedLoopJoinRel { message DuplicateEliminatedGetRel { RelCommon common = 1; ReferenceRel input = 2; - repeated Expression.FieldReference column_ids = 3; } message DuplicateEliminatedJoinRel { diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index d85d51bcb..11cdb2e27 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -54,12 +54,12 @@ The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_r The duplicate eliminated join has two outputs. The first output is no different than a regular join output. The second output is the output to the duplicate eliminated get operator. This output contains only the columns listed in the `duplicate_eliminated_columns` property. This output must be emitted as soon as the build phase is complete. This is because this output is required to compute the probe input. -| Signature | Value | -| -------------------- |----------------------------------------------------------------------------------------------| -| Inputs | 2 | -| Outputs | 1 | -| Property Maintenance | It is the same as the [Hash Equijoin Operator](physical_relations.md#hash-equijoin-operator) | -| Direct Output Order | Same as the [Join](logical_relations.md#join-operator) operator. 
| +| Signature | Value | +| -------------------- |-------------------------------------------------------------------------------------------------------------------| +| Inputs | 2 | +| Outputs | 2. One output carries the deduplicated columns to the Duplicate Eliminated Get, and the second is the join output. | +| Property Maintenance | It is the same as the [Hash Equijoin Operator](physical_relations.md#hash-equijoin-operator) | +| Direct Output Order | Same as the [Join](logical_relations.md#join-operator) operator. | ### Duplicate Eliminated Join Properties From 991f772b296f4953e2b286e305886bf333e2b95a Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 3 Sep 2024 17:32:04 +0200 Subject: [PATCH 7/7] update description --- site/docs/relations/physical_relations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index 11cdb2e27..0a96d7533 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -52,7 +52,7 @@ The Duplicate Eliminated Join, along with the [Duplicate Eliminated Get Operator The Duplicate Eliminated Join is essentially a [Regular Join Operator](logical_relations.md#join-operator). It can have any regular join type, and its execution is the same. The only restriction is that the join must be a hash equi-join. The main difference is that one of its children has, somewhere in its subtree, a dependency on the deduplicated result of the other. Therefore, this operator pushes the deduplicated result to its dependent child via the Duplicate Eliminated Get Operator. The side that will be deduplicated is specified in the Duplicate Eliminated Side property. The other side is the one that depends on the deduplication. -The duplicate eliminated join has two outputs. The first output is no different than a regular join output. The second output is the output to the duplicate eliminated get operator. 
This output contains only the columns listed in the `duplicate_eliminated_columns` property. This output must be emitted as soon as the build phase is complete. This is because this output is required to compute the probe input. +The Duplicate Eliminated Join has two outputs. The first output is no different than a regular join output. The second output is the output to the duplicate eliminated get operator. This output contains only the columns listed in the `duplicate_eliminated_columns` property, corresponding to the side specified by the `duplicate_eliminated_side` property. Note that either the build side or the probe side can be deduplicated and then pushed to the other. | Signature | Value | | -------------------- |-------------------------------------------------------------------------------------------------------------------|