From 1079b75c68b0a49e7ccde0baa2963f535b7aae24 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Fri, 15 Oct 2021 08:54:25 +0200 Subject: [PATCH 1/2] Implement is distinct --- ballista/rust/core/Cargo.toml | 2 +- datafusion/Cargo.toml | 2 +- datafusion/src/logical_plan/operators.rs | 6 ++ .../src/physical_plan/expressions/binary.rs | 63 ++++++++++++++++++- datafusion/src/sql/planner.rs | 12 ++++ datafusion/tests/sql.rs | 42 +++++++++++++ 6 files changed, 123 insertions(+), 4 deletions(-) diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index ac53aa00e47e..b3a4e42ac0d9 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -37,7 +37,7 @@ hashbrown = "0.11" log = "0.4" prost = "0.8" serde = {version = "1", features = ["derive"]} -sqlparser = "0.11.0" +sqlparser = "0.12.0" tokio = "1.0" tonic = "0.5" uuid = { version = "0.8", features = ["v4"] } diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 0c0b3663b9d8..4fa646165d95 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -52,7 +52,7 @@ ahash = "0.7" hashbrown = { version = "0.11", features = ["raw"] } arrow = { version = "^5.3", features = ["prettyprint"] } parquet = { version = "^5.3", features = ["arrow"] } -sqlparser = "0.11" +sqlparser = "0.12" paste = "^1.0" num_cpus = "1.13.0" chrono = "0.4" diff --git a/datafusion/src/logical_plan/operators.rs b/datafusion/src/logical_plan/operators.rs index 0e00736faac8..50bd682ae3f0 100644 --- a/datafusion/src/logical_plan/operators.rs +++ b/datafusion/src/logical_plan/operators.rs @@ -52,6 +52,10 @@ pub enum Operator { Like, /// Does not match a wildcard pattern NotLike, + /// IS DISTINCT FROM + IsDistinctFrom, + /// IS NOT DISTINCT FROM + IsNotDistinctFrom, /// Case sensitive regex match RegexMatch, /// Case insensitive regex match @@ -84,6 +88,8 @@ impl fmt::Display for Operator { Operator::RegexIMatch => "~*", Operator::RegexNotMatch => "!~", Operator::RegexNotIMatch => "!~*", + Operator::IsDistinctFrom => "IS DISTINCT FROM", + Operator::IsNotDistinctFrom => "IS NOT DISTINCT FROM", }; write!(f, "{}", display) } diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 5838239eec8c..4a3ef581e6f3 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -36,7 +36,7 @@ use arrow::compute::kernels::comparison::{ lt_eq_utf8_scalar, lt_utf8_scalar, neq_utf8_scalar, nlike_utf8_scalar, regexp_is_match_utf8_scalar, }; -use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow::datatypes::{ArrowNumericType, DataType, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; use crate::error::{DataFusionError, Result}; @@ -460,6 +460,9 @@ fn common_binary_type( | Operator::RegexIMatch | Operator::RegexNotMatch | Operator::RegexNotIMatch => string_coercion(lhs_type, rhs_type), + Operator::IsDistinctFrom | Operator::IsNotDistinctFrom => { + eq_coercion(lhs_type, rhs_type) + } }; // re-write the error message of failed coercions to include the operator's information @@ -502,7 +505,9 @@ pub fn binary_operator_data_type( | Operator::RegexMatch | Operator::RegexIMatch | Operator::RegexNotMatch - | Operator::RegexNotIMatch => Ok(DataType::Boolean), + | Operator::RegexNotIMatch + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom => Ok(DataType::Boolean), // math operations return the same value as the common coerced type Operator::Plus | Operator::Minus @@ -680,6 +685,10 @@ impl BinaryExpr { Operator::GtEq => binary_array_op!(left, right, gt_eq), Operator::Eq => binary_array_op!(left, right, eq), Operator::NotEq => binary_array_op!(left, right, neq), + Operator::IsDistinctFrom => binary_array_op!(left, right, is_distinct_from), + Operator::IsNotDistinctFrom => { + binary_array_op!(left, right, is_not_distinct_from) + } Operator::Plus => binary_primitive_array_op!(left, right, add), Operator::Minus => binary_primitive_array_op!(left, right, subtract), Operator::Multiply => binary_primitive_array_op!(left, right, multiply), @@ -723,6 +732,56 @@ impl BinaryExpr { } } +fn is_distinct_from( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result +where + T: ArrowNumericType, +{ + Ok(left + .iter() + .zip(right.iter()) + .map(|(x, y)| Some(x != y)) + .collect()) +} + +fn is_distinct_from_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + Ok(left + .iter() + .zip(right.iter()) + .map(|(x, y)| Some(x != y)) + .collect()) +} + +fn is_not_distinct_from( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result +where + T: ArrowNumericType, +{ + Ok(left + .iter() + .zip(right.iter()) + .map(|(x, y)| Some(x == y)) + .collect()) +} + +fn is_not_distinct_from_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + Ok(left + .iter() + .zip(right.iter()) + .map(|(x, y)| Some(x == y)) + .collect()) +} + /// return two physical expressions that are optionally coerced to a /// common type that the binary operator supports. fn binary_cast( diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 2db2b5cb04db..5c1b50107e25 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1202,6 +1202,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { self.sql_expr_to_logical_expr(expr, schema)?, ))), + SQLExpr::IsDistinctFrom(left, right) => Ok(Expr::BinaryExpr { + left: Box::new(self.sql_expr_to_logical_expr(left, schema)?), + op: Operator::IsDistinctFrom, + right: Box::new(self.sql_expr_to_logical_expr(right, schema)?), + }), + + SQLExpr::IsNotDistinctFrom(left, right) => Ok(Expr::BinaryExpr { + left: Box::new(self.sql_expr_to_logical_expr(left, schema)?), + op: Operator::IsNotDistinctFrom, + right: Box::new(self.sql_expr_to_logical_expr(right, schema)?), + }), + SQLExpr::UnaryOp { ref op, ref expr } => match op { UnaryOperator::Not => Ok(Expr::Not(Box::new( self.sql_expr_to_logical_expr(expr, schema)?, diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index f52920575afb..d0f4bbce12b9 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -594,6 +594,48 @@ async fn select_distinct_simple_4() { assert_batches_sorted_eq!(expected, &actual); } +#[tokio::test] +async fn select_distinct_from() { + let mut ctx = ExecutionContext::new(); + + let sql = "select + 1 IS DISTINCT FROM CAST(NULL as INT) as a, + 1 IS DISTINCT FROM 1 as b, + 1 IS NOT DISTINCT FROM CAST(NULL as INT) as c, + 1 IS NOT DISTINCT FROM 1 as d + "; + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+------+-------+-------+------+", + "| a | b | c | d |", + "+------+-------+-------+------+", + "| true | false | false | true |", + "+------+-------+-------+------+", + ]; + assert_batches_eq!(expected, &actual); +} + +#[tokio::test] +async fn select_distinct_from_utf8() { + let mut ctx = ExecutionContext::new(); + + let sql = "select + 'x' IS DISTINCT FROM NULL as a, + 'x' IS DISTINCT FROM 'x' as b, + 'x' IS NOT DISTINCT FROM NULL as c, + 'x' IS NOT DISTINCT FROM 'x' as d + "; + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+------+-------+-------+------+", + "| a | b | c | d |", + "+------+-------+-------+------+", + "| true | false | false | true |", + "+------+-------+-------+------+", + ]; + assert_batches_eq!(expected, &actual); +} + #[tokio::test] async fn projection_same_fields() -> Result<()> { let mut ctx = ExecutionContext::new(); From d943256f1353af15226ae689c7b77beed5ec2343 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Fri, 15 Oct 2021 15:41:31 +0200 Subject: [PATCH 2/2] Extra tests --- datafusion/tests/sql.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index d0f4bbce12b9..283033bcde4e 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -602,15 +602,17 @@ async fn select_distinct_from() { 1 IS DISTINCT FROM CAST(NULL as INT) as a, 1 IS DISTINCT FROM 1 as b, 1 IS NOT DISTINCT FROM CAST(NULL as INT) as c, - 1 IS NOT DISTINCT FROM 1 as d + 1 IS NOT DISTINCT FROM 1 as d, + NULL IS DISTINCT FROM NULL as e, + NULL IS NOT DISTINCT FROM NULL as f "; let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - "+------+-------+-------+------+", - "| a | b | c | d |", - "+------+-------+-------+------+", - "| true | false | false | true |", - "+------+-------+-------+------+", + "+------+-------+-------+------+-------+------+", + "| a | b | c | d | e | f |", + "+------+-------+-------+------+-------+------+", + "| true | false | false | true | false | true |", + "+------+-------+-------+------+-------+------+", ]; assert_batches_eq!(expected, &actual); }