Skip to content

Commit

Permalink
Minor expression cleanups
Browse files Browse the repository at this point in the history
- Add java package name for all proto files
- Add an explicit CAST expression
- Update the varchar literal to specify it's length
- Move AggregateFunction, AggregationPhase and SortField out of the Expression message (top-level in expression.proto).
- Remove AggregationPhase from aggregate rel since it is specified on the individual aggregation function bindings
- Add support for a filter as part of an aggregation measure (SQL2003)
- Cleanup some markdown linebreaks
  • Loading branch information
jacques-n committed Nov 25, 2021
1 parent 5d965be commit 2dd210f
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 60 deletions.
87 changes: 52 additions & 35 deletions binary/expression.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import "type.proto";
import "selection.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message Expression {
Expand All @@ -19,6 +20,7 @@ message Expression {
SingularOrList singular_or_list = 8;
MultiOrList multi_or_list = 9;
Enum enum = 10;
Cast cast = 11;
}

message Enum {
Expand Down Expand Up @@ -47,7 +49,7 @@ message Expression {
IntervalYearToMonth interval_year_to_month = 19;
IntervalDayToSecond interval_day_to_second = 20;
string fixed_char = 21;
string var_char = 22;
VarChar var_char = 22;
bytes fixed_binary = 23;
Decimal decimal = 24;
Struct struct = 25;
Expand All @@ -58,6 +60,16 @@ message Expression {
List list = 30;
}

// whether the literal type should be treated as a nullable type. Applies to
// all members of union other than the Typed null (which should directly
// declare nullability).
bool nullable = 50;

message VarChar {
string value = 1;
uint32 length = 2;
}

message Decimal {
// little-endian twos-complement integer representation of complete value
// (ignoring precision) Always 16 bytes in length
Expand Down Expand Up @@ -106,23 +118,6 @@ message Expression {
Type output_type = 3;
}

message AggregateFunction {
// points to a function_anchor defined in this plan
uint32 function_reference = 1;
repeated Expression args = 2;
repeated SortField sorts = 3;
AggregationPhase phase = 4;
Type output_type = 5;
}

enum AggregationPhase {
UNKNOWN = 0;
INITIAL_TO_INTERMEDIATE = 1;
INTERMEDIATE_TO_INTERMEDIATE = 2;
INITIAL_TO_RESULT = 3;
INTERMEDIATE_TO_RESULT = 4;
}

message WindowFunction {
// points to a function_anchor defined in this plan
uint32 function_reference = 1;
Expand Down Expand Up @@ -153,23 +148,6 @@ message Expression {
}
}

message SortField {
Expression expr = 1;

oneof sort_kind {
SortDirection direction = 2;
uint32 comparison_function_reference = 3;
}
enum SortDirection {
UNKNOWN = 0;
ASC_NULLS_FIRST = 1;
ASC_NULLS_LAST = 2;
DESC_NULLS_FIRST = 3;
DESC_NULLS_LAST = 4;
CLUSTERED = 5;
}
}

message IfThen {

repeated IfClause ifs = 1;
Expand All @@ -181,6 +159,11 @@ message Expression {
}
}

message Cast {
Type type = 1;
Expression input = 2;
}

message SwitchExpression {
repeated IfValue ifs = 1;
Expression else = 2;
Expand Down Expand Up @@ -222,3 +205,37 @@ message Expression {
}
}
}

message SortField {
Expression expr = 1;

oneof sort_kind {
SortDirection direction = 2;
uint32 comparison_function_reference = 3;
}
enum SortDirection {
UNKNOWN = 0;
ASC_NULLS_FIRST = 1;
ASC_NULLS_LAST = 2;
DESC_NULLS_FIRST = 3;
DESC_NULLS_LAST = 4;
CLUSTERED = 5;
}
}

enum AggregationPhase {
UNKNOWN = 0;
INITIAL_TO_INTERMEDIATE = 1;
INTERMEDIATE_TO_INTERMEDIATE = 2;
INITIAL_TO_RESULT = 3;
INTERMEDIATE_TO_RESULT = 4;
}

message AggregateFunction {
// points to a function_anchor defined in this plan
uint32 function_reference = 1;
repeated Expression args = 2;
repeated SortField sorts = 3;
AggregationPhase phase = 4;
Type output_type = 5;
}
1 change: 1 addition & 0 deletions binary/extensions.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ syntax = "proto3";
package io.substrait.extensions;

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

import "google/protobuf/any.proto";
Expand Down
1 change: 1 addition & 0 deletions binary/function.proto
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import "parameterized_types.proto";
import "type_expressions.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

// List of function signatures available.
Expand Down
1 change: 1 addition & 0 deletions binary/parameterized_types.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package io.substrait;
import "type.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message ParameterizedType {
Expand Down
1 change: 1 addition & 0 deletions binary/plan.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import "relations.proto";
import "extensions.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

// Either a relation or root relation
Expand Down
14 changes: 11 additions & 3 deletions binary/relations.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import "extensions.proto";
import "google/protobuf/any.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message RelCommon {
Expand Down Expand Up @@ -136,18 +137,25 @@ message AggregateRel {
Rel input = 2;
repeated Grouping groupings = 3;
repeated Measure measures = 4;
Expression.AggregationPhase phase = 5;

io.substrait.extensions.AdvancedExtension advanced_extension = 10;

message Grouping { repeated int32 input_fields = 1; }

message Measure { Expression.AggregateFunction measure = 1; }
message Measure {
AggregateFunction measure = 1;

// An optional boolean expression that acts to filter which records are
// included in the measure. True means include this record for calculation
// within the measure.
Expression filter = 2;
}
}

message SortRel {
RelCommon common = 1;
Rel input = 2;
repeated Expression.SortField sorts = 3;
repeated SortField sorts = 3;
io.substrait.extensions.AdvancedExtension advanced_extension = 10;
}

Expand Down
1 change: 1 addition & 0 deletions binary/selection.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ syntax = "proto3";
package io.substrait;

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message ReferenceSegment {
Expand Down
1 change: 1 addition & 0 deletions binary/type.proto
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ syntax = "proto3";
package io.substrait;

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message Type {
Expand Down
1 change: 1 addition & 0 deletions binary/type_expressions.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package io.substrait;
import "type.proto";

option java_multiple_files = true;
option java_package = "io.substrait.proto";
option csharp_namespace = "Substrait.Protobuf";

message DerivationExpression {
Expand Down
12 changes: 7 additions & 5 deletions site/docs/expressions/specialized_record_expressions.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
# Specialized Record Expressions

While most all types of operations could be reduced to functions, in some cases this would be overly simplistic. Instead, it is helpful to construct some other expression constructs.

These constructs should be focused on different expression types as opposed to something that directly related to syntactic sugar. For example, CAST and EXTRACT or SQL operations that are presented using specialized syntax. However, they can easily modeled using a function paradigm with minimal complexity.



## Literal Expressions
For each data type, it is possible to create a literal value for that data type. The representation depends on the serialization format. Literal expressions include both a type literal and a possibly null value.

For each data type, it is possible to create a literal value for that data type. The representation depends on the serialization format.


## Cast Expression
To convert a value from one type to another, Substrait defines a cast expression. Cast expression declare an expected type and an input argument.

## If Expression


## If Expression
An if value expression is an expression composed of one if clause, zero or more else if clauses and an else clause. In pseudo code, they are envisioned as:

```
Expand All @@ -23,7 +27,6 @@ else <result expression 3>
When an if expression is declared, all return expressions must be the same identical type.

#### Shortcut Behavior

An if expression is expected to logically short-circuit on a positive outcome. This means that a skipped else/elseif expression cannot cause an error. For example, this should not actually throw an error despite the fact that the cast operation should fail.

```
Expand All @@ -34,7 +37,6 @@ else cast('hello' as integer)


## Switch Expression

Switch expression allow a selection of alternate branches based on the value of a given expression. They are an optimized form of a generic if expression where all conditions are equality to the same value. In pseudo-code:

```
Expand Down
18 changes: 1 addition & 17 deletions site/docs/relations/logical_relations.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ The read operator is an operator that produces one output. A simple example woul

Read definition types are built by the community and added to the specification. This is a portion of specification that is expected to grow rapidly.



#### Virtual Table

| Property | Description | Required |
Expand Down Expand Up @@ -101,8 +99,6 @@ The project operation will produce one or more additional expressions based on t
| Property Maintenance | Distribution maintained, mapped by emit. Orderedness: Maintained if no window operations. Extended to include projection fields if fields are direct references. If window operations are present, no orderedness is maintained. |
| Direct Output Order | The field order of the input + the list of new expressions in the order they are declared in the expressions list. |



### Project Properties

| Property | Description | Required |
Expand All @@ -123,8 +119,6 @@ The join operation will combine two separate inputs into a single output, based
| Property Maintenance | Distribution is maintained. Orderedness is empty post operation. Physical relations may provide better property maintenance. |
| Direct Output Order | The emit order of the left input followed by the emit order of the right input. |



### Join Properties

| Property | Description | Required |
Expand Down Expand Up @@ -156,8 +150,6 @@ The set operation ecompasses several set level operations that support combining
| Property Maintenance | Maintains distribution if all inputs have the same ordinal distribution. Orderness is not maintained. |
| Direct Output Order | All inputs are ordinally matched and returned together. All inputs must have matching record types. |



### Set Properties

| Property | Description | Required |
Expand Down Expand Up @@ -198,7 +190,6 @@ The fetch operation eliminates records outside a desired window. Typically corre
| Offset | A positive integer. Declares the offset for retrieval of records. | Optional, defaults to 0. |
| Count | A positive integer. Declares the number of records that should be returned. | Required |

###

## Aggregate Operation

Expand All @@ -218,9 +209,8 @@ The aggregate operation groups input data on one or more sets of grouping keys,
| Input | The relational input | Required |
| Grouping Sets | One or more grouping sets | Optional, required if no measures. |
| Per Grouping Set | A list of expression grouping that the aggregation measured should be calculated for. | Optional, defaults to 0. |
| Measures | A list of one or more aggregate expressions. | Optional, required if no grouping sets. |
| Measures | A list of one or more aggregate expressions along with an optional filter. | Optional, required if no grouping sets. |

###

## Write Operator

Expand All @@ -247,8 +237,6 @@ The write operator is an operator that consumes one output and writes it to stor

Write definition types are built by the community and added to the specification. This is a portion of specification that is expected to grow rapidly.



#### Virtual Table

| Property | Description | Required |
Expand All @@ -267,10 +255,6 @@ Write definition types are built by the community and added to the specification







## Discussion Points

* How to handle correlated operations?

0 comments on commit 2dd210f

Please sign in to comment.