From 324e30438561b9a6229298ff68218c2a83f73211 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Mon, 20 Feb 2023 20:58:24 +1100 Subject: [PATCH 01/13] Support catalog.schema.table.column in SQL SELECT and WHERE --- datafusion/common/src/column.rs | 74 ++- datafusion/common/src/dfschema.rs | 145 +++--- datafusion/common/src/error.rs | 58 +-- datafusion/common/src/table_reference.rs | 453 ++++++++++-------- datafusion/common/src/utils.rs | 134 ++++++ datafusion/core/src/physical_plan/planner.rs | 2 +- datafusion/core/tests/sql/idenfifers.rs | 8 +- datafusion/core/tests/sql/references.rs | 2 +- .../test_files/information_schema.slt | 24 + .../tests/sqllogictests/test_files/join.slt | 2 +- datafusion/expr/src/expr_rewriter.rs | 5 +- datafusion/expr/src/expr_schema.rs | 4 +- datafusion/expr/src/logical_plan/builder.rs | 78 +-- datafusion/expr/src/utils.rs | 4 +- .../optimizer/src/common_subexpr_eliminate.rs | 6 +- datafusion/optimizer/src/optimizer.rs | 6 +- .../optimizer/src/push_down_projection.rs | 9 +- .../optimizer/src/scalar_subquery_to_join.rs | 10 +- .../simplify_expressions/expr_simplifier.rs | 8 +- datafusion/optimizer/src/type_coercion.rs | 17 +- .../src/unwrap_cast_in_comparison.rs | 26 +- .../proto/src/logical_plan/from_proto.rs | 5 +- datafusion/proto/src/logical_plan/to_proto.rs | 6 +- datafusion/sql/src/expr/identifier.rs | 168 +++++-- datafusion/sql/src/expr/mod.rs | 2 +- datafusion/sql/src/planner.rs | 13 +- datafusion/sql/tests/integration_test.rs | 23 +- docs/source/user-guide/example-usage.md | 4 +- 28 files changed, 856 insertions(+), 440 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index ee1304ea05a8..f6a660149954 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -17,7 +17,8 @@ //! Column -use crate::{DFSchema, DataFusionError, Result, SchemaError}; +use crate::utils::{parse_identifiers_normalized, quote_identifier}; +use crate::{DFSchema, DataFusionError, OwnedTableReference, Result, SchemaError}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; @@ -27,15 +28,18 @@ use std::sync::Arc; /// A named reference to a qualified field in a schema. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { - /// relation/table name. - pub relation: Option, + /// relation/table reference. + pub relation: Option, /// field/column name. 
pub name: String, } impl Column { /// Create Column from optional qualifier and name - pub fn new(relation: Option>, name: impl Into) -> Self { + pub fn new( + relation: Option>, + name: impl Into, + ) -> Self { Self { relation: relation.map(|r| r.into()), name: name.into(), @@ -53,26 +57,36 @@ impl Column { /// Deserialize a fully qualified name string into a column pub fn from_qualified_name(flat_name: impl Into) -> Self { let flat_name = flat_name.into(); - use sqlparser::tokenizer::Token; - - let dialect = sqlparser::dialect::GenericDialect {}; - let mut tokenizer = sqlparser::tokenizer::Tokenizer::new(&dialect, &flat_name); - if let Ok(tokens) = tokenizer.tokenize() { - if let [Token::Word(relation), Token::Period, Token::Word(name)] = - tokens.as_slice() - { - return Column { - relation: Some(relation.value.clone()), - name: name.value.clone(), - }; - } - } - // any expression that's not in the form of `foo.bar` will be treated as unqualified column - // name - Column { - relation: None, - name: flat_name, - } + let mut idents = parse_identifiers_normalized(&flat_name); + + let (relation, name) = match idents.len() { + 1 => (None, idents.remove(0)), + 2 => ( + Some(OwnedTableReference::Bare { + table: idents.remove(0), + }), + idents.remove(0), + ), + 3 => ( + Some(OwnedTableReference::Partial { + schema: idents.remove(0), + table: idents.remove(0), + }), + idents.remove(0), + ), + 4 => ( + Some(OwnedTableReference::Full { + catalog: idents.remove(0), + schema: idents.remove(0), + table: idents.remove(0), + }), + idents.remove(0), + ), + // any expression that failed to parse or has more than 4 period delimited + // identifiers will be treated as an unqualified column name + _ => (None, flat_name), + }; + Self { relation, name } } /// Serialize column into a flat name string @@ -83,6 +97,16 @@ impl Column { } } + /// Serialize column into a quoted flat name string + pub fn quoted_flat_name(&self) -> String { + match &self.relation { + Some(r) => { + format!("{}.{}", r.to_quoted_string(), quote_identifier(&self.name)) + } + None => quote_identifier(&self.name), + } + } + /// Qualify column if not done yet. 
/// /// If this column already has a [relation](Self::relation), it will be returned as is and the given parameters are @@ -147,7 +171,7 @@ impl Column { } Err(DataFusionError::SchemaError(SchemaError::FieldNotFound { - field: Column::new(self.relation.clone(), self.name), + field: Box::new(Column::new(self.relation.clone(), self.name)), valid_fields: schemas .iter() .flat_map(|s| s.fields().iter().map(|f| f.qualified_column())) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 982459ac658b..023b0873d344 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -23,7 +23,8 @@ use std::convert::TryFrom; use std::sync::Arc; use crate::error::{DataFusionError, Result, SchemaError}; -use crate::{field_not_found, Column, TableReference}; +use crate::utils::quote_identifier; +use crate::{field_not_found, Column, OwnedTableReference, TableReference}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; @@ -69,7 +70,7 @@ impl DFSchema { if !qualified_names.insert((qualifier, field.name())) { return Err(DataFusionError::SchemaError( SchemaError::DuplicateQualifiedField { - qualifier: qualifier.to_string(), + qualifier: qualifier.clone(), name: field.name().to_string(), }, )); @@ -89,18 +90,16 @@ impl DFSchema { let mut qualified_names = qualified_names .iter() .map(|(l, r)| (l.to_owned(), r.to_owned())) - .collect::>(); - qualified_names.sort_by(|a, b| { - let a = format!("{}.{}", a.0, a.1); - let b = format!("{}.{}", b.0, b.1); - a.cmp(&b) - }); + .collect::>(); + qualified_names.sort(); for (qualifier, name) in &qualified_names { if unqualified_names.contains(name) { return Err(DataFusionError::SchemaError( SchemaError::AmbiguousReference { - qualifier: Some(qualifier.to_string()), - name: name.to_string(), + field: Column { + relation: Some((*qualifier).clone()), + name: name.to_string(), + }, }, )); } @@ -139,7 +138,9 @@ impl DFSchema { for field in other_schema.fields() { // skip duplicate columns let duplicated_field = match field.qualifier() { - Some(q) => self.field_with_name(Some(q.as_str()), field.name()).is_ok(), + Some(q) => self + .field_with_name(Some(&q.as_table_reference()), field.name()) + .is_ok(), // for unqualified columns, check as unqualified name None => self.field_with_unqualified_name(field.name()).is_ok(), }; @@ -172,7 +173,7 @@ impl DFSchema { // a fully qualified field name is provided. match &self.fields[i].qualifier { Some(qualifier) => { - if (qualifier.to_owned() + "." + self.fields[i].name()) == name { + if (qualifier.to_string() + "." + self.fields[i].name()) == name { return Err(DataFusionError::Plan(format!( "Fully qualified field name '{name}' was supplied to `index_of` \ which is deprecated. Please use `index_of_column_by_name` instead" @@ -184,12 +185,12 @@ impl DFSchema { } } - Err(field_not_found(None, name, self)) + Err(field_not_found::<&str>(None, name, self)) } pub fn index_of_column_by_name( &self, - qualifier: Option<&str>, + qualifier: Option<&TableReference>, name: &str, ) -> Result> { let mut matches = self @@ -200,19 +201,19 @@ impl DFSchema { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. - (Some(q), Some(field_q)) => q == field_q && field.name() == name, + (Some(q), Some(field_q)) => { + q.resolved_eq(&field_q.as_table_reference()) && field.name() == name + } // field to lookup is qualified but current field is unqualified. 
(Some(qq), None) => { // the original field may now be aliased with a name that matches the // original qualified name - let table_ref = TableReference::parse_str(field.name().as_str()); - match table_ref { - TableReference::Partial { schema, table } => { - schema == qq && table == name - } - TableReference::Full { schema, table, .. } => { - schema == qq && table == name - } + let column = Column::from_qualified_name(field.name()); + match column { + Column { + relation: Some(r), + name: column_name, + } => &r == qq && column_name == name, _ => false, } } @@ -226,9 +227,11 @@ impl DFSchema { None => Ok(Some(idx)), // found more than one matches Some(_) => Err(DataFusionError::Internal(format!( - "Ambiguous reference to qualified field named '{}.{}'", - qualifier.unwrap_or(""), - name + "Ambiguous reference to qualified field named {}.{}", + qualifier + .map(|q| q.to_quoted_string()) + .unwrap_or("".to_string()), + quote_identifier(name) ))), }, } @@ -236,23 +239,22 @@ impl DFSchema { /// Find the index of the column with the given qualifier and name pub fn index_of_column(&self, col: &Column) -> Result { - let qualifier = col.relation.as_deref(); - self.index_of_column_by_name(col.relation.as_deref(), &col.name)? - .ok_or_else(|| { - field_not_found(qualifier.map(|s| s.to_string()), &col.name, self) - }) + let tr = col.relation.as_ref().map(|r| r.as_table_reference()); + self.index_of_column_by_name(tr.as_ref(), &col.name)? + .ok_or_else(|| field_not_found(col.relation.clone(), &col.name, self)) } /// Check if the column is in the current schema pub fn is_column_from_schema(&self, col: &Column) -> Result { - self.index_of_column_by_name(col.relation.as_deref(), &col.name) + let tr = col.relation.as_ref().map(|r| r.as_table_reference()); + self.index_of_column_by_name(tr.as_ref(), &col.name) .map(|idx| idx.is_some()) } /// Find the field with the given name pub fn field_with_name( &self, - qualifier: Option<&str>, + qualifier: Option<&TableReference>, name: &str, ) -> Result<&DFField> { if let Some(qualifier) = qualifier { @@ -263,7 +265,7 @@ impl DFSchema { } /// Find all fields having the given qualifier - pub fn fields_with_qualified(&self, qualifier: &str) -> Vec<&DFField> { + pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&DFField> { self.fields .iter() .filter(|field| field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false)) @@ -282,12 +284,14 @@ impl DFSchema { pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> { let matches = self.fields_with_unqualified_name(name); match matches.len() { - 0 => Err(field_not_found(None, name, self)), + 0 => Err(field_not_found::<&str>(None, name, self)), 1 => Ok(matches[0]), _ => Err(DataFusionError::SchemaError( SchemaError::AmbiguousReference { - qualifier: None, - name: name.to_string(), + field: Column { + relation: None, + name: name.to_string(), + }, }, )), } @@ -296,7 +300,7 @@ impl DFSchema { /// Find the field with the given qualified name pub fn field_with_qualified_name( &self, - qualifier: &str, + qualifier: &TableReference, name: &str, ) -> Result<&DFField> { let idx = self @@ -309,7 +313,9 @@ impl DFSchema { /// Find the field with the given qualified column pub fn field_from_column(&self, column: &Column) -> Result<&DFField> { match &column.relation { - Some(r) => self.field_with_qualified_name(r, &column.name), + Some(r) => { + self.field_with_qualified_name(&r.as_table_reference(), &column.name) + } None => self.field_with_unqualified_name(&column.name), } } @@ -413,12 +419,13 @@ 
impl DFSchema { } /// Replace all field qualifier with new value in schema - pub fn replace_qualifier(self, qualifier: &str) -> Self { + pub fn replace_qualifier(self, qualifier: impl Into) -> Self { + let qualifier = qualifier.into(); DFSchema { fields: self .fields .into_iter() - .map(|f| DFField::from_qualified(qualifier, f.field)) + .map(|f| DFField::from_qualified(qualifier.clone(), f.field)) .collect(), ..self } @@ -573,21 +580,21 @@ impl ExprSchema for DFSchema { #[derive(Debug, Clone, PartialEq, Eq)] pub struct DFField { /// Optional qualifier (usually a table or relation name) - qualifier: Option, + qualifier: Option, /// Arrow field definition field: Field, } impl DFField { /// Creates a new `DFField` - pub fn new( - qualifier: Option<&str>, + pub fn new>( + qualifier: Option, name: &str, data_type: DataType, nullable: bool, ) -> Self { DFField { - qualifier: qualifier.map(|s| s.to_owned()), + qualifier: qualifier.map(|s| s.into()), field: Field::new(name, data_type, nullable), } } @@ -601,9 +608,12 @@ impl DFField { } /// Create a qualified field from an existing Arrow field - pub fn from_qualified(qualifier: &str, field: Field) -> Self { + pub fn from_qualified( + qualifier: impl Into, + field: Field, + ) -> Self { Self { - qualifier: Some(qualifier.to_owned()), + qualifier: Some(qualifier.into()), field, } } @@ -649,7 +659,7 @@ impl DFField { } /// Get the optional qualifier - pub fn qualifier(&self) -> Option<&String> { + pub fn qualifier(&self) -> Option<&OwnedTableReference> { self.qualifier.as_ref() } @@ -677,7 +687,7 @@ mod tests { // lookup with unqualified name "t1.c0" let err = schema.index_of_column(&col).err().unwrap(); assert_eq!( - "Schema error: No field named 't1.c0'. Valid fields are 't1'.'c0', 't1'.'c1'.", + r#"Schema error: No field named "t1.c0". 
Valid fields are "t1"."c0", "t1"."c1"."#, &format!("{err}") ); Ok(()) @@ -733,8 +743,12 @@ mod tests { join.to_string() ); // test valid access - assert!(join.field_with_qualified_name("t1", "c0").is_ok()); - assert!(join.field_with_qualified_name("t2", "c0").is_ok()); + assert!(join + .field_with_qualified_name(&TableReference::bare("t1"), "c0") + .is_ok()); + assert!(join + .field_with_qualified_name(&TableReference::bare("t2"), "c0") + .is_ok()); // test invalid access assert!(join.field_with_unqualified_name("c0").is_err()); assert!(join.field_with_unqualified_name("t1.c0").is_err()); @@ -750,7 +764,7 @@ mod tests { assert!(join.is_err()); assert_eq!( "Schema error: Schema contains duplicate \ - qualified field name \'t1\'.\'c0\'", + qualified field name \"t1\".\"c0\"", &format!("{}", join.err().unwrap()) ); Ok(()) @@ -764,7 +778,7 @@ mod tests { assert!(join.is_err()); assert_eq!( "Schema error: Schema contains duplicate \ - unqualified field name \'c0\'", + unqualified field name \"c0\"", &format!("{}", join.err().unwrap()) ); Ok(()) @@ -780,14 +794,18 @@ mod tests { join.to_string() ); // test valid access - assert!(join.field_with_qualified_name("t1", "c0").is_ok()); + assert!(join + .field_with_qualified_name(&TableReference::bare("t1"), "c0") + .is_ok()); assert!(join.field_with_unqualified_name("c0").is_ok()); assert!(join.field_with_unqualified_name("c100").is_ok()); assert!(join.field_with_name(None, "c100").is_ok()); // test invalid access assert!(join.field_with_unqualified_name("t1.c0").is_err()); assert!(join.field_with_unqualified_name("t1.c100").is_err()); - assert!(join.field_with_qualified_name("", "c100").is_err()); + assert!(join + .field_with_qualified_name(&TableReference::bare(""), "c100") + .is_err()); Ok(()) } @@ -799,7 +817,7 @@ mod tests { assert!(join.is_err()); assert_eq!( "Schema error: Schema contains qualified \ - field name \'t1\'.\'c0\' and unqualified field name \'c0\' which would be ambiguous", + field name \"t1\".\"c0\" and unqualified field name \"c0\" which would be ambiguous", &format!("{}", join.err().unwrap()) ); Ok(()) @@ -809,11 +827,11 @@ mod tests { #[test] fn helpful_error_messages() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; - let expected_help = "Valid fields are \'t1\'.\'c0\', \'t1\'.\'c1\'."; + let expected_help = "Valid fields are \"t1\".\"c0\", \"t1\".\"c1\"."; // Pertinent message parts - let expected_err_msg = "Fully qualified field name \'t1.c0\'"; + let expected_err_msg = "Fully qualified field name 't1.c0'"; assert!(schema - .field_with_qualified_name("x", "y") + .field_with_qualified_name(&TableReference::bare("x"), "y") .unwrap_err() .to_string() .contains(expected_help)); @@ -841,12 +859,15 @@ mod tests { let col = Column::from_qualified_name("t1.c0"); let err = schema.index_of_column(&col).err().unwrap(); - assert_eq!("Schema error: No field named 't1'.'c0'.", &format!("{err}")); + assert_eq!( + r#"Schema error: No field named "t1"."c0"."#, + &format!("{err}") + ); // the same check without qualifier let col = Column::from_name("c0"); let err = schema.index_of_column(&col).err().unwrap(); - assert_eq!("Schema error: No field named 'c0'.", &format!("{err}")); + assert_eq!(r#"Schema error: No field named "c0"."#, &format!("{err}")); } #[test] @@ -1079,7 +1100,7 @@ mod tests { let arrow_schema_ref = Arc::new(arrow_schema.clone()); let df_schema = DFSchema::new_with_metadata( - vec![DFField::new(None, "c0", DataType::Int64, true)], + vec![DFField::new::<&str>(None, "c0", 
DataType::Int64, true)], metadata, ) .unwrap(); diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 3f10c1261cff..88b2b3c49bc2 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -23,7 +23,8 @@ use std::io; use std::result; use std::sync::Arc; -use crate::{Column, DFSchema}; +use crate::utils::quote_identifier; +use crate::{Column, DFSchema, OwnedTableReference}; #[cfg(feature = "avro")] use apache_avro::Error as AvroError; use arrow::error::ArrowError; @@ -120,29 +121,29 @@ macro_rules! plan_err { #[derive(Debug)] pub enum SchemaError { /// Schema contains a (possibly) qualified and unqualified field with same unqualified name - AmbiguousReference { - qualifier: Option, + AmbiguousReference { field: Column }, + /// Schema contains duplicate qualified field name + DuplicateQualifiedField { + qualifier: OwnedTableReference, name: String, }, - /// Schema contains duplicate qualified field name - DuplicateQualifiedField { qualifier: String, name: String }, /// Schema contains duplicate unqualified field name DuplicateUnqualifiedField { name: String }, /// No field with this name FieldNotFound { - field: Column, + field: Box, valid_fields: Vec, }, } /// Create a "field not found" DataFusion::SchemaError -pub fn field_not_found( - qualifier: Option, +pub fn field_not_found>( + qualifier: Option, name: &str, schema: &DFSchema, ) -> DataFusionError { DataFusionError::SchemaError(SchemaError::FieldNotFound { - field: Column::new(qualifier, name), + field: Box::new(Column::new(qualifier, name)), valid_fields: schema .fields() .iter() @@ -158,25 +159,14 @@ impl Display for SchemaError { field, valid_fields, } => { - write!(f, "No field named ")?; - if let Some(q) = &field.relation { - write!(f, "'{}'.'{}'", q, field.name)?; - } else { - write!(f, "'{}'", field.name)?; - } + write!(f, "No field named {}", field.quoted_flat_name())?; if !valid_fields.is_empty() { write!( f, ". 
Valid fields are {}", valid_fields .iter() - .map(|field| { - if let Some(q) = &field.relation { - format!("'{}'.'{}'", q, field.name) - } else { - format!("'{}'", field.name) - } - }) + .map(|field| field.quoted_flat_name()) .collect::>() .join(", ") )?; @@ -186,20 +176,32 @@ impl Display for SchemaError { Self::DuplicateQualifiedField { qualifier, name } => { write!( f, - "Schema contains duplicate qualified field name '{qualifier}'.'{name}'" + "Schema contains duplicate qualified field name {}.{}", + qualifier.to_quoted_string(), + quote_identifier(name) ) } Self::DuplicateUnqualifiedField { name } => { write!( f, - "Schema contains duplicate unqualified field name '{name}'" + "Schema contains duplicate unqualified field name {}", + quote_identifier(name) ) } - Self::AmbiguousReference { qualifier, name } => { - if let Some(q) = qualifier { - write!(f, "Schema contains qualified field name '{q}'.'{name}' and unqualified field name '{name}' which would be ambiguous") + Self::AmbiguousReference { field } => { + if field.relation.is_some() { + write!( + f, + "Schema contains qualified field name {} and unqualified field name {} which would be ambiguous", + field.quoted_flat_name(), + quote_identifier(&field.name) + ) } else { - write!(f, "Ambiguous reference to unqualified field '{name}'") + write!( + f, + "Ambiguous reference to unqualified field {}", + field.quoted_flat_name() + ) } } } diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 4ca41edd9aa2..60973d46410d 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -15,13 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::error::Result; -use sqlparser::{ - ast::Ident, - dialect::GenericDialect, - parser::{Parser, ParserError}, - tokenizer::{Token, TokenWithLocation}, -}; +use crate::utils::{parse_identifiers_normalized, quote_identifier}; use std::borrow::Cow; /// A resolved path to a table of the form "catalog.schema.table" @@ -67,53 +61,51 @@ pub enum TableReference<'a> { }, } -/// Represents a path to a table that may require further resolution -/// that owns the underlying names -#[derive(Debug, Clone)] -pub enum OwnedTableReference { - /// An unqualified table reference, e.g. "table" - Bare { - /// The table name - table: String, - }, - /// A partially resolved table reference, e.g. "schema.table" - Partial { - /// The schema containing the table - schema: String, - /// The table name - table: String, - }, - /// A fully resolved table reference, e.g. 
"catalog.schema.table" - Full { - /// The catalog (aka database) containing the table - catalog: String, - /// The schema containing the table - schema: String, - /// The table name - table: String, - }, -} - -impl OwnedTableReference { - /// Return a `TableReference` view of this `OwnedTableReference` - pub fn as_table_reference(&self) -> TableReference<'_> { +impl std::fmt::Display for TableReference<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Bare { table } => TableReference::Bare { - table: table.into(), - }, - Self::Partial { schema, table } => TableReference::Partial { - schema: schema.into(), - table: table.into(), - }, - Self::Full { + TableReference::Bare { table } => write!(f, "{table}"), + TableReference::Partial { schema, table } => { + write!(f, "{schema}.{table}") + } + TableReference::Full { catalog, schema, table, - } => TableReference::Full { - catalog: catalog.into(), - schema: schema.into(), - table: table.into(), - }, + } => write!(f, "{catalog}.{schema}.{table}"), + } + } +} + +impl<'a> TableReference<'a> { + /// Convenience method for creating a `Bare` variant of `TableReference` + pub fn bare(table: impl Into>) -> TableReference<'a> { + TableReference::Bare { + table: table.into(), + } + } + + /// Convenience method for creating a `Partial` variant of `TableReference` + pub fn partial( + table: impl Into>, + schema: impl Into>, + ) -> TableReference<'a> { + TableReference::Partial { + table: table.into(), + schema: schema.into(), + } + } + + /// Convenience method for creating a `Full` variant of `TableReference` + pub fn full( + table: impl Into>, + schema: impl Into>, + catalog: impl Into>, + ) -> TableReference<'a> { + TableReference::Full { + table: table.into(), + schema: schema.into(), + catalog: catalog.into(), } } @@ -125,39 +117,44 @@ impl OwnedTableReference { | Self::Bare { table } => table, } } -} -impl std::fmt::Display for OwnedTableReference { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + /// Retrieve the schema name if in the `Partial` or `Full` qualification + pub fn schema(&self) -> Option<&str> { match self { - OwnedTableReference::Bare { table } => write!(f, "{table}"), - OwnedTableReference::Partial { schema, table } => { - write!(f, "{schema}.{table}") - } - OwnedTableReference::Full { - catalog, - schema, - table, - } => write!(f, "{catalog}.{schema}.{table}"), + Self::Full { schema, .. } | Self::Partial { schema, .. } => Some(schema), + _ => None, } } -} -/// Convert `OwnedTableReference` into a `TableReference`. Somewhat -/// awkward to use but 'idiomatic': `(&table_ref).into()` -impl<'a> From<&'a OwnedTableReference> for TableReference<'a> { - fn from(r: &'a OwnedTableReference) -> Self { - r.as_table_reference() + /// Retrieve the catalog name if in the `Full` qualification + pub fn catalog(&self) -> Option<&str> { + match self { + Self::Full { catalog, .. } => Some(catalog), + _ => None, + } } -} -impl<'a> TableReference<'a> { - /// Retrieve the actual table name, regardless of qualification - pub fn table(&self) -> &str { + /// Compare with another `TableReference` as if both are resolved. + /// This allows comparing across variants, where if a field is not present + /// in both variants being compared then it is ignored in the comparison. + /// + /// e.g. this allows a `TableReference::Bare` to be considered equal to a + /// fully qualified `TableReference::Full` if the table names match. 
+ pub fn resolved_eq(&self, other: &Self) -> bool { match self { - Self::Full { table, .. } - | Self::Partial { table, .. } - | Self::Bare { table } => table, + TableReference::Bare { table } => table == other.table(), + TableReference::Partial { schema, table } => { + table == other.table() && other.schema().map_or(true, |s| s == schema) + } + TableReference::Full { + catalog, + schema, + table, + } => { + table == other.table() + && other.schema().map_or(true, |s| s == schema) + && other.catalog().map_or(true, |c| c == catalog) + } } } @@ -190,6 +187,48 @@ impl<'a> TableReference<'a> { } } + /// Converts directly into an [`OwnedTableReference`] + pub fn to_owned_reference(self) -> OwnedTableReference { + match self { + Self::Full { + catalog, + schema, + table, + } => OwnedTableReference::Full { + catalog: catalog.into(), + schema: schema.into(), + table: table.into(), + }, + Self::Partial { schema, table } => OwnedTableReference::Partial { + schema: schema.into(), + table: table.into(), + }, + Self::Bare { table } => OwnedTableReference::Bare { + table: table.into(), + }, + } + } + + /// Forms a string where the identifiers are quoted + pub fn to_quoted_string(&self) -> String { + match self { + TableReference::Bare { table } => quote_identifier(table), + TableReference::Partial { schema, table } => { + format!("{}.{}", quote_identifier(schema), quote_identifier(table)) + } + TableReference::Full { + catalog, + schema, + table, + } => format!( + "{}.{}.{}", + quote_identifier(catalog), + quote_identifier(schema), + quote_identifier(table) + ), + } + } + /// Forms a [`TableReference`] by attempting to parse `s` as a multipart identifier, /// failing that then taking the entire unnormalized input as the identifier itself. /// @@ -199,14 +238,7 @@ impl<'a> TableReference<'a> { /// `Foo".bar` (note the preserved case and requiring two double quotes to represent /// a single double quote in the identifier) pub fn parse_str(s: &'a str) -> Self { - let mut parts = parse_identifiers(s) - .unwrap_or_default() - .into_iter() - .map(|id| match id.quote_style { - Some(_) => id.value, - None => id.value.to_ascii_lowercase(), - }) - .collect::>(); + let mut parts = parse_identifiers_normalized(s); match parts.len() { 1 => Self::Bare { @@ -226,57 +258,156 @@ impl<'a> TableReference<'a> { } } -// TODO: remove when can use https://github.com/sqlparser-rs/sqlparser-rs/issues/805 -fn parse_identifiers(s: &str) -> Result> { - let dialect = GenericDialect; - let mut parser = Parser::new(&dialect).try_with_sql(s)?; - let mut idents = vec![]; - - // expecting at least one word for identifier - match parser.next_token_no_skip() { - Some(TokenWithLocation { - token: Token::Word(w), - .. - }) => idents.push(w.to_ident()), - Some(TokenWithLocation { token, .. }) => { - return Err(ParserError::ParserError(format!( - "Unexpected token in identifier: {token}" - )))? - } - None => { - return Err(ParserError::ParserError( - "Empty input when parsing identifier".to_string(), - ))? +/// Represents a path to a table that may require further resolution +/// that owns the underlying names +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub enum OwnedTableReference { + /// An unqualified table reference, e.g. "table" + Bare { + /// The table name + table: String, + }, + /// A partially resolved table reference, e.g. "schema.table" + Partial { + /// The schema containing the table + schema: String, + /// The table name + table: String, + }, + /// A fully resolved table reference, e.g. 
"catalog.schema.table" + Full { + /// The catalog (aka database) containing the table + catalog: String, + /// The schema containing the table + schema: String, + /// The table name + table: String, + }, +} + +impl std::fmt::Display for OwnedTableReference { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + OwnedTableReference::Bare { table } => write!(f, "{table}"), + OwnedTableReference::Partial { schema, table } => { + write!(f, "{schema}.{table}") + } + OwnedTableReference::Full { + catalog, + schema, + table, + } => write!(f, "{catalog}.{schema}.{table}"), } - }; - - while let Some(TokenWithLocation { token, .. }) = parser.next_token_no_skip() { - match token { - // ensure that optional period is succeeded by another identifier - Token::Period => match parser.next_token_no_skip() { - Some(TokenWithLocation { - token: Token::Word(w), - .. - }) => idents.push(w.to_ident()), - Some(TokenWithLocation { token, .. }) => { - return Err(ParserError::ParserError(format!( - "Unexpected token following period in identifier: {token}" - )))? - } - None => { - return Err(ParserError::ParserError( - "Trailing period in identifier".to_string(), - ))? - } + } +} + +impl OwnedTableReference { + /// Return a `TableReference` view of this `OwnedTableReference` + pub fn as_table_reference(&self) -> TableReference<'_> { + match self { + Self::Bare { table } => TableReference::Bare { + table: table.into(), + }, + Self::Partial { schema, table } => TableReference::Partial { + schema: schema.into(), + table: table.into(), + }, + Self::Full { + catalog, + schema, + table, + } => TableReference::Full { + catalog: catalog.into(), + schema: schema.into(), + table: table.into(), }, - _ => { - return Err(ParserError::ParserError(format!( - "Unexpected token in identifier: {token}" - )))? + } + } + + /// Retrieve the actual table name, regardless of qualification + pub fn table(&self) -> &str { + match self { + Self::Full { table, .. } + | Self::Partial { table, .. 
} + | Self::Bare { table } => table, + } + } + + /// Forms a string where the identifiers are quoted + pub fn to_quoted_string(&self) -> String { + match self { + OwnedTableReference::Bare { table } => quote_identifier(table), + OwnedTableReference::Partial { schema, table } => { + format!("{}.{}", quote_identifier(schema), quote_identifier(table)) } + OwnedTableReference::Full { + catalog, + schema, + table, + } => format!( + "{}.{}.{}", + quote_identifier(catalog), + quote_identifier(schema), + quote_identifier(table) + ), } } - Ok(idents) +} + +impl PartialEq> for OwnedTableReference { + fn eq(&self, other: &TableReference<'_>) -> bool { + self.as_table_reference().eq(other) + } +} + +impl PartialEq for TableReference<'_> { + fn eq(&self, other: &OwnedTableReference) -> bool { + self.eq(&other.as_table_reference()) + } +} + +/// Parse a `&str` into a OwnedTableReference +impl From<&str> for OwnedTableReference { + fn from(s: &str) -> Self { + let table_reference: TableReference = s.into(); + table_reference.to_owned_reference() + } +} + +/// Parse a `String` into a OwnedTableReference +impl From for OwnedTableReference { + fn from(s: String) -> Self { + Self::from(s.as_str()) + } +} + +/// Parse a `&String` into a OwnedTableReference +impl From<&String> for OwnedTableReference { + fn from(s: &String) -> Self { + Self::from(s.as_str()) + } +} + +/// Parse a `&String` into a OwnedTableReference +impl From<&OwnedTableReference> for OwnedTableReference { + fn from(s: &OwnedTableReference) -> Self { + s.clone() + } +} + +/// Parse a `TableReference` into a OwnedTableReference +impl From<&'_ TableReference<'_>> for OwnedTableReference { + fn from(s: &'_ TableReference) -> Self { + s.to_owned().to_owned_reference() + } +} + +/// Convert `OwnedTableReference` into a `TableReference`. Somewhat +/// awkward to use but 'idiomatic': `(&table_ref).into()` +impl<'a> From<&'a OwnedTableReference> for TableReference<'a> { + fn from(r: &'a OwnedTableReference) -> Self { + r.as_table_reference() + } } /// Parse a string into a TableReference, normalizing where appropriate @@ -288,6 +419,12 @@ impl<'a> From<&'a str> for TableReference<'a> { } } +impl<'a> From<&'a String> for TableReference<'a> { + fn from(s: &'a String) -> Self { + Self::parse_str(s) + } +} + impl<'a> From> for TableReference<'a> { fn from(resolved: ResolvedTableReference<'a>) -> Self { Self::Full { @@ -302,64 +439,6 @@ impl<'a> From> for TableReference<'a> { mod tests { use super::*; - #[test] - fn test_parse_identifiers() -> Result<()> { - let s = "CATALOG.\"F(o)o. \"\"bar\".table"; - let actual = parse_identifiers(s)?; - let expected = vec![ - Ident { - value: "CATALOG".to_string(), - quote_style: None, - }, - Ident { - value: "F(o)o. 
\"bar".to_string(), - quote_style: Some('"'), - }, - Ident { - value: "table".to_string(), - quote_style: None, - }, - ]; - assert_eq!(expected, actual); - - let s = ""; - let err = parse_identifiers(s).expect_err("didn't fail to parse"); - assert_eq!( - "SQL(ParserError(\"Empty input when parsing identifier\"))", - format!("{err:?}") - ); - - let s = "*schema.table"; - let err = parse_identifiers(s).expect_err("didn't fail to parse"); - assert_eq!( - "SQL(ParserError(\"Unexpected token in identifier: *\"))", - format!("{err:?}") - ); - - let s = "schema.table*"; - let err = parse_identifiers(s).expect_err("didn't fail to parse"); - assert_eq!( - "SQL(ParserError(\"Unexpected token in identifier: *\"))", - format!("{err:?}") - ); - - let s = "schema.table."; - let err = parse_identifiers(s).expect_err("didn't fail to parse"); - assert_eq!( - "SQL(ParserError(\"Trailing period in identifier\"))", - format!("{err:?}") - ); - - let s = "schema.*"; - let err = parse_identifiers(s).expect_err("didn't fail to parse"); - assert_eq!( - "SQL(ParserError(\"Unexpected token following period in identifier: *\"))", - format!("{err:?}") - ); - - Ok(()) - } - #[test] fn test_table_reference_from_str_normalizes() { let expected = TableReference::Full { diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs index 3c073015343c..a1226def8e56 100644 --- a/datafusion/common/src/utils.rs +++ b/datafusion/common/src/utils.rs @@ -20,6 +20,10 @@ use crate::{DataFusionError, Result, ScalarValue}; use arrow::array::ArrayRef; use arrow::compute::SortOptions; +use sqlparser::ast::Ident; +use sqlparser::dialect::GenericDialect; +use sqlparser::parser::{Parser, ParserError}; +use sqlparser::tokenizer::{Token, TokenWithLocation}; use std::cmp::Ordering; /// Given column vectors, returns row at `idx`. @@ -158,6 +162,78 @@ where Ok(low) } +/// Wraps identifier string in double quotes, escaping any double quotes in +/// the identifier by replacing it with two double quotes +/// +/// e.g. identifier `tab.le"name` becomes `"tab.le""name"` +pub fn quote_identifier(s: &str) -> String { + format!("\"{}\"", s.replace('"', "\"\"")) +} + +// TODO: remove when can use https://github.com/sqlparser-rs/sqlparser-rs/issues/805 +pub(crate) fn parse_identifiers(s: &str) -> Result> { + let dialect = GenericDialect; + let mut parser = Parser::new(&dialect).try_with_sql(s)?; + let mut idents = vec![]; + + // expecting at least one word for identifier + match parser.next_token_no_skip() { + Some(TokenWithLocation { + token: Token::Word(w), + .. + }) => idents.push(w.to_ident()), + Some(TokenWithLocation { token, .. }) => { + return Err(ParserError::ParserError(format!( + "Unexpected token in identifier: {token}" + )))? + } + None => { + return Err(ParserError::ParserError( + "Empty input when parsing identifier".to_string(), + ))? + } + }; + + while let Some(TokenWithLocation { token, .. }) = parser.next_token_no_skip() { + match token { + // ensure that optional period is succeeded by another identifier + Token::Period => match parser.next_token_no_skip() { + Some(TokenWithLocation { + token: Token::Word(w), + .. + }) => idents.push(w.to_ident()), + Some(TokenWithLocation { token, .. }) => { + return Err(ParserError::ParserError(format!( + "Unexpected token following period in identifier: {token}" + )))? + } + None => { + return Err(ParserError::ParserError( + "Trailing period in identifier".to_string(), + ))? 
+ } + }, + _ => { + return Err(ParserError::ParserError(format!( + "Unexpected token in identifier: {token}" + )))? + } + } + } + Ok(idents) +} + +pub(crate) fn parse_identifiers_normalized(s: &str) -> Vec { + parse_identifiers(s) + .unwrap_or_default() + .into_iter() + .map(|id| match id.quote_style { + Some(_) => id.value, + None => id.value.to_ascii_lowercase(), + }) + .collect::>() +} + #[cfg(test)] mod tests { use arrow::array::Float64Array; @@ -330,4 +406,62 @@ mod tests { assert_eq!(res, 2); Ok(()) } + + #[test] + fn test_parse_identifiers() -> Result<()> { + let s = "CATALOG.\"F(o)o. \"\"bar\".table"; + let actual = parse_identifiers(s)?; + let expected = vec![ + Ident { + value: "CATALOG".to_string(), + quote_style: None, + }, + Ident { + value: "F(o)o. \"bar".to_string(), + quote_style: Some('"'), + }, + Ident { + value: "table".to_string(), + quote_style: None, + }, + ]; + assert_eq!(expected, actual); + + let s = ""; + let err = parse_identifiers(s).expect_err("didn't fail to parse"); + assert_eq!( + "SQL(ParserError(\"Empty input when parsing identifier\"))", + format!("{err:?}") + ); + + let s = "*schema.table"; + let err = parse_identifiers(s).expect_err("didn't fail to parse"); + assert_eq!( + "SQL(ParserError(\"Unexpected token in identifier: *\"))", + format!("{err:?}") + ); + + let s = "schema.table*"; + let err = parse_identifiers(s).expect_err("didn't fail to parse"); + assert_eq!( + "SQL(ParserError(\"Unexpected token in identifier: *\"))", + format!("{err:?}") + ); + + let s = "schema.table."; + let err = parse_identifiers(s).expect_err("didn't fail to parse"); + assert_eq!( + "SQL(ParserError(\"Trailing period in identifier\"))", + format!("{err:?}") + ); + + let s = "schema.*"; + let err = parse_identifiers(s).expect_err("didn't fail to parse"); + assert_eq!( + "SQL(ParserError(\"Unexpected token following period in identifier: *\"))", + format!("{err:?}") + ); + + Ok(()) + } } diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index d0ee38ac90fd..93616f3fc684 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -2389,7 +2389,7 @@ Internal error: Optimizer rule 'type_coercion' failed due to unexpected error: E Self { schema: DFSchemaRef::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Int32, false)], + vec![DFField::new::<&str>(None, "a", DataType::Int32, false)], HashMap::new(), ) .unwrap(), diff --git a/datafusion/core/tests/sql/idenfifers.rs b/datafusion/core/tests/sql/idenfifers.rs index a305f23b4944..1b57f60bd435 100644 --- a/datafusion/core/tests/sql/idenfifers.rs +++ b/datafusion/core/tests/sql/idenfifers.rs @@ -211,28 +211,28 @@ async fn case_insensitive_in_sql_errors() { .await .unwrap_err() .to_string(); - assert_contains!(actual, "No field named 'column1'"); + assert_contains!(actual, r#"No field named "column1""#); let actual = ctx .sql("SELECT Column1 from test") .await .unwrap_err() .to_string(); - assert_contains!(actual, "No field named 'column1'"); + assert_contains!(actual, r#"No field named "column1""#); let actual = ctx .sql("SELECT column1 from test") .await .unwrap_err() .to_string(); - assert_contains!(actual, "No field named 'column1'"); + assert_contains!(actual, r#"No field named "column1""#); let actual = ctx .sql(r#"SELECT "column1" from test"#) .await .unwrap_err() .to_string(); - assert_contains!(actual, "No field named 'column1'"); + assert_contains!(actual, r#"No field named "column1""#); // This 
should pass (note the quotes) ctx.sql(r#"SELECT "Column1" from test"#).await.unwrap(); diff --git a/datafusion/core/tests/sql/references.rs b/datafusion/core/tests/sql/references.rs index f006cbb45984..335bc630861c 100644 --- a/datafusion/core/tests/sql/references.rs +++ b/datafusion/core/tests/sql/references.rs @@ -67,7 +67,7 @@ async fn qualified_table_references_and_fields() -> Result<()> { let error = ctx.sql(sql).await.unwrap_err(); assert_contains!( error.to_string(), - "No field named 'f1'.'c1'. Valid fields are 'test'.'f.c1', 'test'.'test.c2'" + r#"No field named "f1"."c1". Valid fields are "test"."f.c1", "test"."test.c2""# ); // however, enclosing it in double quotes is ok diff --git a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt index 99fdfe30a666..7e4e98125548 100644 --- a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt +++ b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt @@ -84,6 +84,30 @@ datafusion information_schema views VIEW datafusion public t BASE TABLE datafusion public t2 BASE TABLE +query TTTT rowsort +SELECT * from information_schema.tables WHERE tables.table_schema='information_schema'; +---- +datafusion information_schema columns VIEW +datafusion information_schema df_settings VIEW +datafusion information_schema tables VIEW +datafusion information_schema views VIEW + +query TTTT rowsort +SELECT * from information_schema.tables WHERE information_schema.tables.table_schema='information_schema'; +---- +datafusion information_schema columns VIEW +datafusion information_schema df_settings VIEW +datafusion information_schema tables VIEW +datafusion information_schema views VIEW + +query TTTT rowsort +SELECT * from information_schema.tables WHERE datafusion.information_schema.tables.table_schema='information_schema'; +---- +datafusion information_schema columns VIEW +datafusion information_schema df_settings VIEW +datafusion information_schema tables VIEW +datafusion information_schema views VIEW + # Cleanup statement ok drop table t diff --git a/datafusion/core/tests/sqllogictests/test_files/join.slt b/datafusion/core/tests/sqllogictests/test_files/join.slt index e78925a7faab..a1b770f104e7 100644 --- a/datafusion/core/tests/sqllogictests/test_files/join.slt +++ b/datafusion/core/tests/sqllogictests/test_files/join.slt @@ -59,7 +59,7 @@ CREATE TABLE t2(t2_id INT, t2_name TEXT, t2_int INT) AS VALUES (55, 'w', 3); # left semi with wrong where clause -query error DataFusion error: Schema error: No field named 't2'.'t2_id'. Valid fields are 't1'.'t1_id', 't1'.'t1_name', 't1'.'t1_int'. +query error DataFusion error: Schema error: No field named "t2"."t2_id". Valid fields are "t1"."t1_id", "t1"."t1_name", "t1"."t1_int". 
SELECT t1.t1_id, t1.t1_name, t1.t1_int diff --git a/datafusion/expr/src/expr_rewriter.rs b/datafusion/expr/src/expr_rewriter.rs index af672a85ce26..aa537291f232 100644 --- a/datafusion/expr/src/expr_rewriter.rs +++ b/datafusion/expr/src/expr_rewriter.rs @@ -634,7 +634,8 @@ mod test { fn normalize_cols_non_exist() { // test normalizing columns when the name doesn't exist let expr = col("a") + col("b"); - let schema_a = make_schema_with_empty_metadata(vec![make_field("tableA", "a")]); + let schema_a = + make_schema_with_empty_metadata(vec![make_field("\"tableA\"", "a")]); let schemas = vec![schema_a].into_iter().map(Arc::new).collect::>(); let schemas = schemas.iter().collect::>(); @@ -643,7 +644,7 @@ mod test { .to_string(); assert_eq!( error, - "Schema error: No field named 'b'. Valid fields are 'tableA'.'a'." + r#"Schema error: No field named "b". Valid fields are "tableA"."a"."# ); } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 493c425d7888..c778418b42c6 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -247,12 +247,12 @@ impl ExprSchemable for Expr { fn to_field(&self, input_schema: &DFSchema) -> Result { match self { Expr::Column(c) => Ok(DFField::new( - c.relation.as_deref(), + c.relation.clone(), &c.name, self.get_type(input_schema)?, self.nullable(input_schema)?, )), - _ => Ok(DFField::new( + _ => Ok(DFField::new::<&str>( None, &self.display_name()?, self.get_type(input_schema)?, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index f979e1f76f98..d312a4c8e7a7 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -178,7 +178,7 @@ impl LogicalPlanBuilder { .map(|(j, data_type)| { // naming is following convention https://www.postgresql.org/docs/current/queries-values.html let name = &format!("column{}", j + 1); - DFField::new( + DFField::new::<&str>( None, name, data_type.clone().unwrap_or(DataType::Utf8), @@ -545,14 +545,22 @@ impl LogicalPlanBuilder { match (&l.relation, &r.relation) { (Some(lr), Some(rr)) => { - let l_is_left = - self.plan.schema().field_with_qualified_name(lr, &l.name); - let l_is_right = - right.schema().field_with_qualified_name(lr, &l.name); - let r_is_left = - self.plan.schema().field_with_qualified_name(rr, &r.name); - let r_is_right = - right.schema().field_with_qualified_name(rr, &r.name); + let l_is_left = self.plan.schema().field_with_qualified_name( + &lr.as_table_reference(), + &l.name, + ); + let l_is_right = right.schema().field_with_qualified_name( + &lr.as_table_reference(), + &l.name, + ); + let r_is_left = self.plan.schema().field_with_qualified_name( + &rr.as_table_reference(), + &r.name, + ); + let r_is_right = right.schema().field_with_qualified_name( + &rr.as_table_reference(), + &r.name, + ); match (l_is_left, l_is_right, r_is_left, r_is_right) { (_, Ok(_), Ok(_), _) => (Ok(r), Ok(l)), @@ -564,10 +572,14 @@ impl LogicalPlanBuilder { } } (Some(lr), None) => { - let l_is_left = - self.plan.schema().field_with_qualified_name(lr, &l.name); - let l_is_right = - right.schema().field_with_qualified_name(lr, &l.name); + let l_is_left = self.plan.schema().field_with_qualified_name( + &lr.as_table_reference(), + &l.name, + ); + let l_is_right = right.schema().field_with_qualified_name( + &lr.as_table_reference(), + &l.name, + ); match (l_is_left, l_is_right) { (Ok(_), _) => (Ok(l), Self::normalize(&right, r)), @@ -579,10 +591,14 @@ impl LogicalPlanBuilder { 
} } (None, Some(rr)) => { - let r_is_left = - self.plan.schema().field_with_qualified_name(rr, &r.name); - let r_is_right = - right.schema().field_with_qualified_name(rr, &r.name); + let r_is_left = self.plan.schema().field_with_qualified_name( + &rr.as_table_reference(), + &r.name, + ); + let r_is_right = right.schema().field_with_qualified_name( + &rr.as_table_reference(), + &r.name, + ); match (r_is_left, r_is_right) { (Ok(_), _) => (Ok(r), Self::normalize(&right, l)), @@ -1033,7 +1049,7 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result( None, left_field.name(), data_type, @@ -1220,7 +1236,7 @@ pub fn unnest(input: LogicalPlan, column: Column) -> Result { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => DFField::new( - unnest_field.qualifier().map(String::as_str), + unnest_field.qualifier(), unnest_field.name(), field.data_type().clone(), unnest_field.is_nullable(), @@ -1261,7 +1277,7 @@ pub fn unnest(input: LogicalPlan, column: Column) -> Result { mod tests { use crate::{expr, expr_fn::exists}; use arrow::datatypes::{DataType, Field}; - use datafusion_common::SchemaError; + use datafusion_common::{OwnedTableReference, SchemaError, TableReference}; use crate::logical_plan::StringifiedPlan; @@ -1536,10 +1552,13 @@ mod tests { match plan { Err(DataFusionError::SchemaError(SchemaError::AmbiguousReference { - qualifier, - name, + field: + Column { + relation: Some(OwnedTableReference::Bare { table }), + name, + }, })) => { - assert_eq!("employee_csv", qualifier.unwrap().as_str()); + assert_eq!("employee_csv", table.as_str()); assert_eq!("id", &name); Ok(()) } @@ -1562,10 +1581,13 @@ mod tests { match plan { Err(DataFusionError::SchemaError(SchemaError::AmbiguousReference { - qualifier, - name, + field: + Column { + relation: Some(OwnedTableReference::Bare { table }), + name, + }, })) => { - assert_eq!("employee_csv", qualifier.unwrap().as_str()); + assert_eq!("employee_csv", table.as_str()); assert_eq!("state", &name); Ok(()) } @@ -1666,7 +1688,7 @@ mod tests { // Check unnested field is a scalar let field = plan .schema() - .field_with_name(Some("test_table"), "strings") + .field_with_name(Some(&TableReference::bare("test_table")), "strings") .unwrap(); assert_eq!(&DataType::Utf8, field.data_type()); @@ -1685,7 +1707,7 @@ mod tests { // Check unnested struct list field should be a struct. 
let field = plan .schema() - .field_with_name(Some("test_table"), "structs") + .field_with_name(Some(&TableReference::bare("test_table")), "structs") .unwrap(); assert!(matches!(field.data_type(), DataType::Struct(_))); diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 5706ef304d5a..5bc0131b73ab 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -35,6 +35,7 @@ use crate::{ use arrow::datatypes::{DataType, TimeUnit}; use datafusion_common::{ Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, + TableReference, }; use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; @@ -188,8 +189,9 @@ pub fn expand_qualified_wildcard( qualifier: &str, schema: &DFSchema, ) -> Result> { + let qualifier = TableReference::from(qualifier); let qualified_fields: Vec = schema - .fields_with_qualified(qualifier) + .fields_with_qualified(&qualifier) .into_iter() .cloned() .collect(); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index e1830390dee9..be8d4801b58c 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -312,7 +312,7 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` - let field = DFField::new(None, &id, data_type.clone(), true); + let field = DFField::new::<&str>(None, &id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); } @@ -624,8 +624,8 @@ mod test { let schema = Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new(None, "a", DataType::Int64, false), - DFField::new(None, "c", DataType::Int64, false), + DFField::new::<&str>(None, "a", DataType::Int64, false), + DFField::new::<&str>(None, "c", DataType::Int64, false), ], Default::default(), )?); diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index b4eea1db86b9..ffd303ea4c64 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -466,9 +466,9 @@ mod tests { Internal error: Optimizer rule 'get table_scan rule' failed, due to generate a different schema, \ original schema: DFSchema { fields: [], metadata: {} }, \ new schema: DFSchema { fields: [\ - DFField { qualifier: Some(\"test\"), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(\"test\"), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(\"test\"), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ metadata: {} }. 
\ This was likely caused by a bug in DataFusion's code \ and we would welcome that you file an bug report in our issue tracker", diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index b6ba3131c9a7..e7bbbdc8b3c9 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -469,10 +469,7 @@ fn replace_alias( let mut map = HashMap::new(); for field in input_schema.fields() { let col = field.qualified_column(); - let alias_col = Column { - relation: Some(alias.to_owned()), - name: col.name.clone(), - }; + let alias_col = Column::new(Some(alias), col.name.clone()); map.insert(alias_col, col); } required_columns @@ -497,8 +494,10 @@ fn push_down_scan( let schema = scan.source.schema(); let mut projection: BTreeSet = required_columns .iter() + // TODO: change scan.table_name from String? .filter(|c| { - c.relation.is_none() || c.relation.as_ref().unwrap() == &scan.table_name + c.relation.is_none() + || c.relation.as_ref().unwrap().to_string() == scan.table_name }) .map(|c| schema.index_of(&c.name)) .filter_map(ArrowResult::ok) diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index cfec3cb741bd..34ea208b30a1 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -272,16 +272,10 @@ fn optimize_scalar( // qualify the join columns for outside the subquery let mut subqry_cols: Vec<_> = subqry_cols .iter() - .map(|it| Column { - relation: Some(subqry_alias.clone()), - name: it.name.clone(), - }) + .map(|it| Column::new(Some(subqry_alias.clone()), it.name.clone())) .collect(); - let qry_expr = Expr::Column(Column { - relation: Some(subqry_alias), - name: "__value".to_string(), - }); + let qry_expr = Expr::Column(Column::new(Some(subqry_alias), "__value".to_string())); // if correlated subquery's operation is column equality, put the clause into join on clause. 
let mut restore_where_clause = true; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 6ee1d017219b..3cab42de0f01 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1758,10 +1758,10 @@ mod tests { Arc::new( DFSchema::new_with_metadata( vec![ - DFField::new(None, "c1", DataType::Utf8, true), - DFField::new(None, "c2", DataType::Boolean, true), - DFField::new(None, "c1_non_null", DataType::Utf8, false), - DFField::new(None, "c2_non_null", DataType::Boolean, false), + DFField::new::<&str>(None, "c1", DataType::Utf8, true), + DFField::new::<&str>(None, "c2", DataType::Boolean, true), + DFField::new::<&str>(None, "c1_non_null", DataType::Utf8, false), + DFField::new::<&str>(None, "c2_non_null", DataType::Boolean, false), ], HashMap::new(), ) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 6b6cea82fcec..960dc9b376f0 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -630,7 +630,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Float64, true)], + vec![DFField::new::<&str>(None, "a", DataType::Float64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -648,7 +648,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Float64, true)], + vec![DFField::new::<&str>(None, "a", DataType::Float64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -847,7 +847,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Int64, true)], + vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -865,7 +865,12 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Decimal128(12, 4), true)], + vec![DFField::new::<&str>( + None, + "a", + DataType::Decimal128(12, 4), + true, + )], std::collections::HashMap::new(), ) .unwrap(), @@ -1019,7 +1024,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", data_type, true)], + vec![DFField::new::<&str>(None, "a", data_type, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -1031,7 +1036,7 @@ mod test { fn test_type_coercion_rewrite() -> Result<()> { let schema = Arc::new( DFSchema::new_with_metadata( - vec![DFField::new(None, "a", DataType::Int64, true)], + vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 604f3531418f..d6e1cb631125 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -695,14 +695,24 @@ mod tests { Arc::new( DFSchema::new_with_metadata( vec![ - DFField::new(None, "c1", DataType::Int32, false), - DFField::new(None, "c2", DataType::Int64, false), - DFField::new(None, "c3", DataType::Decimal128(18, 2), false), - DFField::new(None, "c4", DataType::Decimal128(38, 37), false), - DFField::new(None, "c5", DataType::Float32, false), - DFField::new(None, "c6", 
DataType::UInt32, false), - DFField::new(None, "ts_nano_none", timestamp_nano_none_type(), false), - DFField::new(None, "ts_nano_utf", timestamp_nano_utc_type(), false), + DFField::new::<&str>(None, "c1", DataType::Int32, false), + DFField::new::<&str>(None, "c2", DataType::Int64, false), + DFField::new::<&str>(None, "c3", DataType::Decimal128(18, 2), false), + DFField::new::<&str>(None, "c4", DataType::Decimal128(38, 37), false), + DFField::new::<&str>(None, "c5", DataType::Float32, false), + DFField::new::<&str>(None, "c6", DataType::UInt32, false), + DFField::new::<&str>( + None, + "ts_nano_none", + timestamp_nano_none_type(), + false, + ), + DFField::new::<&str>( + None, + "ts_nano_utf", + timestamp_nano_utc_type(), + false, + ), ], HashMap::new(), ) diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 498563b2ab47..e54dee9f212c 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -145,10 +145,7 @@ impl From for Column { fn from(c: protobuf::Column) -> Self { let protobuf::Column { relation, name } = c; - Self { - relation: relation.map(|r| r.relation), - name, - } + Self::new(relation.map(|r| r.relation), name) } } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 90c99c0d96bd..dbd91faa4db8 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -237,9 +237,9 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { impl From for protobuf::Column { fn from(c: Column) -> Self { Self { - relation: c - .relation - .map(|relation| protobuf::ColumnRelation { relation }), + relation: c.relation.map(|relation| protobuf::ColumnRelation { + relation: relation.to_string(), + }), name: c.name, } } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index a581d5f4720d..fb726f5e8713 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -15,12 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::planner::{ - idents_to_table_reference, ContextProvider, PlannerContext, SqlToRel, -}; +use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::normalize_ident; use datafusion_common::{ - Column, DFSchema, DataFusionError, OwnedTableReference, Result, ScalarValue, + Column, DFSchema, DataFusionError, Result, ScalarValue, TableReference, }; use datafusion_expr::{Case, Expr, GetIndexedField}; use sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -52,11 +50,49 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } + // (relation, column name) + fn form_identifier(idents: &[String]) -> Result<(Option, &String)> { + match idents.len() { + 1 => Ok((None, &idents[0])), + 2 => Ok(( + Some(TableReference::Bare { + table: (&idents[0]).into(), + }), + &idents[1], + )), + 3 => Ok(( + Some(TableReference::Partial { + schema: (&idents[0]).into(), + table: (&idents[1]).into(), + }), + &idents[2], + )), + 4 => Ok(( + Some(TableReference::Full { + catalog: (&idents[0]).into(), + schema: (&idents[1]).into(), + table: (&idents[2]).into(), + }), + &idents[3], + )), + _ => Err(DataFusionError::Internal(format!( + "Incorrect number of identifiers: {}", + idents.len() + ))), + } + } + pub(super) fn sql_compound_identifier_to_expr( &self, ids: Vec, schema: &DFSchema, ) -> Result { + if ids.len() < 2 { + return Err(DataFusionError::Internal(format!( + "Not a compound identifier: {ids:?}" + ))); + } + if ids[0].value.starts_with('@') { let var_names: Vec<_> = ids.into_iter().map(normalize_ident).collect(); let ty = self @@ -69,44 +105,100 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { })?; Ok(Expr::ScalarVariable(ty, var_names)) } else { - // only support "schema.table" type identifiers here - let (name, relation) = match idents_to_table_reference( - ids, - self.options.enable_ident_normalization, - )? { - OwnedTableReference::Partial { schema, table } => (table, schema), - r @ OwnedTableReference::Bare { .. } - | r @ OwnedTableReference::Full { .. } => { - return Err(DataFusionError::Plan(format!( - "Unsupported compound identifier '{r:?}'", - ))); - } - }; + let ids = ids + .into_iter() + .map(|id| { + if self.options.enable_ident_normalization { + normalize_ident(id) + } else { + id.value + } + }) + .collect::>(); - // Try and find the reference in schema - match schema.field_with_qualified_name(&relation, &name) { - Ok(_) => { - // found an exact match on a qualified name so this is a table.column identifier - Ok(Expr::Column(Column { - relation: Some(relation), - name, - })) + // Possibilities we search with, in order from top to bottom for each len: + // + // len = 2: + // 1. (table.column) + // 2. (column).nested + // + // len = 3: + // 1. (schema.table.column) + // 2. (table.column).nested + // 3. (column).nested1.nested2 + // + // len = 4: + // 1. (catalog.schema.table.column) + // 2. (schema.table.column).nested1 + // 3. (table.column).nested1.nested2 + // 4. (column).nested1.nested2.nested3 + // + // len = 5: + // 1. (catalog.schema.table.column).nested + // 2. (schema.table.column).nested1.nested2 + // 3. (table.column).nested1.nested2.nested3 + // 4. (column).nested1.nested2.nested3.nested4 + // + // len > 5: + // 1. (catalog.schema.table.column).nested[.nestedN]+ + // 2. (schema.table.column).nested1.nested2[.nestedN]+ + // 3. (table.column).nested1.nested2.nested3[.nestedN]+ + // 4. 
(column).nested1.nested2.nested3.nested4[.nestedN]+ + // + // Currently not supporting more than one nested level + // Though ideally once that support is in place, this code should work with it + + // TODO: remove when can support multiple nested identifiers + if ids.len() > 5 { + return Err(DataFusionError::Internal(format!( + "Unsupported compound identifier: {ids:?}" + ))); + } + + // take at most 4 identifiers to form a Column to search with + // - 1 for the column name + // - 0 to 3 for the TableReference + let bound = ids.len().min(4); + // search from most specific to least specific + let search_result = (0..bound).rev().find_map(|i| { + let nested_names_index = i + 1; + let s = &ids[0..nested_names_index]; + let (relation, column_name) = Self::form_identifier(s).unwrap(); + let field = schema.field_with_name(relation.as_ref(), column_name).ok(); + field.map(|f| (f, nested_names_index)) + }); + + match search_result { + // found matching field with spare identifier(s) for nested field(s) in structure + Some((field, index)) if index < ids.len() => { + // TODO: remove when can support multiple nested identifiers + if index < ids.len() - 1 { + return Err(DataFusionError::Internal(format!( + "Nested identifiers not yet supported for column {}", + field.qualified_column().quoted_flat_name() + ))); + } + let nested_name = ids[index].to_string(); + Ok(Expr::GetIndexedField(GetIndexedField::new( + Box::new(Expr::Column(field.qualified_column())), + ScalarValue::Utf8(Some(nested_name)), + ))) } - Err(_) => { - if let Some(field) = - schema.fields().iter().find(|f| f.name().eq(&relation)) - { - // Access to a field of a column which is a structure, example: SELECT my_struct.key - Ok(Expr::GetIndexedField(GetIndexedField::new( - Box::new(Expr::Column(field.qualified_column())), - ScalarValue::Utf8(Some(name)), + // found matching field with no spare identifier(s) + Some((field, _index)) => Ok(Expr::Column(field.qualified_column())), + // found no matching field, will return a default + None => { + // default case: treat all identifiers as the column reference, with no nested field + // this length check is needed because 5 identifiers would require a nested field + if ids.len() == 5 { + Err(DataFusionError::Internal(format!( + "Unsupported compound identifier: {ids:?}" ))) } else { - // table.column identifier - Ok(Expr::Column(Column { - relation: Some(relation), - name, - })) + let s = &ids[0..ids.len()]; + let (relation, column_name) = Self::form_identifier(s).unwrap(); + let relation = relation.map(|r| r.to_owned_reference()); + Ok(Expr::Column(Column::new(relation, column_name))) } } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index f226924516ef..3e601d42d0f0 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -92,7 +92,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .find(|field| match field.qualifier() { Some(field_q) => { field.name() == &col.name - && field_q.ends_with(&format!(".{q}")) + && field_q.to_string().ends_with(&format!(".{q}")) } _ => false, }) { diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index e9252db9feb6..4c8dd4fcd869 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -194,23 +194,22 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .try_for_each(|col| match col { Expr::Column(col) => match &col.relation { Some(r) => { - schema.field_with_qualified_name(r, &col.name)?; + schema.field_with_qualified_name( + &r.as_table_reference(), + &col.name, + )?; 
Ok(()) } None => { if !schema.fields_with_unqualified_name(&col.name).is_empty() { Ok(()) } else { - Err(field_not_found(None, col.name.as_str(), schema)) + Err(field_not_found::<&str>(None, col.name.as_str(), schema)) } } } .map_err(|_: DataFusionError| { - field_not_found( - col.relation.as_ref().map(|s| s.to_owned()), - col.name.as_str(), - schema, - ) + field_not_found(col.relation.clone(), col.name.as_str(), schema) }), _ => Err(DataFusionError::Internal("Not a column".to_string())), }) diff --git a/datafusion/sql/tests/integration_test.rs b/datafusion/sql/tests/integration_test.rs index 44c0559ef35a..f9213e8a328d 100644 --- a/datafusion/sql/tests/integration_test.rs +++ b/datafusion/sql/tests/integration_test.rs @@ -226,11 +226,11 @@ Dml: op=[Insert] table=[test_decimal] #[rstest] #[case::duplicate_columns( "INSERT INTO test_decimal (id, price, price) VALUES (1, 2, 3), (4, 5, 6)", - "Schema error: Schema contains duplicate unqualified field name 'price'" + "Schema error: Schema contains duplicate unqualified field name \"price\"" )] #[case::non_existing_column( "INSERT INTO test_decimal (nonexistent, price) VALUES (1, 2), (4, 5)", - "Schema error: No field named 'nonexistent'. Valid fields are 'id', 'price'." + "Schema error: No field named \"nonexistent\". Valid fields are \"id\", \"price\"." )] #[case::type_mismatch( "INSERT INTO test_decimal SELECT '2022-01-01', to_timestamp('2022-01-01T12:00:00')", @@ -1121,9 +1121,9 @@ fn select_simple_aggregate_with_groupby_column_unselected() { fn select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist() { let sql = "SELECT SUM(age) FROM person GROUP BY doesnotexist"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!("Schema error: No field named 'doesnotexist'. Valid fields are 'SUM(person.age)', \ - 'person'.'id', 'person'.'first_name', 'person'.'last_name', 'person'.'age', 'person'.'state', \ - 'person'.'salary', 'person'.'birth_date', 'person'.'😀'.", format!("{err}")); + assert_eq!("Schema error: No field named \"doesnotexist\". Valid fields are \"SUM(person.age)\", \ + \"person\".\"id\", \"person\".\"first_name\", \"person\".\"last_name\", \"person\".\"age\", \"person\".\"state\", \ + \"person\".\"salary\", \"person\".\"birth_date\", \"person\".\"😀\".", format!("{err}")); } #[test] @@ -1432,6 +1432,17 @@ fn select_where_with_positive_operator() { quick_test(sql, expected); } +#[test] +fn select_where_compound_identifiers() { + let sql = "SELECT aggregate_test_100.c3 \ + FROM public.aggregate_test_100 \ + WHERE aggregate_test_100.c3 > 0.1"; + let expected = "Projection: public.aggregate_test_100.c3\ + \n Filter: public.aggregate_test_100.c3 > Float64(0.1)\ + \n TableScan: public.aggregate_test_100"; + quick_test(sql, expected); +} + #[test] fn select_order_by_index() { let sql = "SELECT id FROM person ORDER BY 1"; @@ -3849,7 +3860,7 @@ fn assert_field_not_found(err: DataFusionError, name: &str) { match err { DataFusionError::SchemaError { .. 
} => { let msg = format!("{err}"); - let expected = format!("Schema error: No field named '{name}'."); + let expected = format!("Schema error: No field named \"{name}\"."); if !msg.starts_with(&expected) { panic!("error [{msg}] did not start with [{expected}]"); } diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md index c497c66e597b..d7782b511a1d 100644 --- a/docs/source/user-guide/example-usage.md +++ b/docs/source/user-guide/example-usage.md @@ -118,8 +118,8 @@ async fn main() -> datafusion::error::Result<()> { let ctx = SessionContext::new(); let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; - let df = df.filter(col("A").lt_eq(col("c")))? - .aggregate(vec![col("A")], vec![min(col("b"))])? + let df = df.filter(col("\"A\"").lt_eq(col("c")))? + .aggregate(vec![col("\"A\"")], vec![min(col("b"))])? .limit(0, Some(100))?; // execute and print results From 5b358a9a752022f5da5b99ed7247685150b6e844 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 28 Feb 2023 22:16:47 +1100 Subject: [PATCH 02/13] Merge branch 'main' into support_catalog_schema_in_ident --- .github/dependabot.yml | 4 + README.md | 6 +- benchmarks/Cargo.toml | 10 +- benchmarks/expected-plans/q15.txt | 19 +- benchmarks/expected-plans/q16.txt | 25 +- benchmarks/expected-plans/q18.txt | 25 +- benchmarks/expected-plans/q2.txt | 47 +- benchmarks/expected-plans/q21.txt | 5 +- benchmarks/expected-plans/q22.txt | 3 +- benchmarks/expected-plans/q8.txt | 41 +- benchmarks/expected-plans/q9.txt | 25 +- datafusion-cli/Cargo.lock | 192 ++- datafusion-cli/Cargo.toml | 8 +- datafusion-examples/Cargo.toml | 8 +- datafusion-examples/examples/catalog.rs | 289 ++++ datafusion/CHANGELOG.md | 188 +++ datafusion/common/Cargo.toml | 6 +- datafusion/common/src/config.rs | 2 +- datafusion/common/src/dfschema.rs | 33 +- datafusion/common/src/parsers.rs | 4 + datafusion/core/Cargo.toml | 27 +- datafusion/core/benches/merge.rs | 72 + datafusion/core/benches/sort.rs | 6 +- datafusion/core/src/dataframe.rs | 39 +- datafusion/core/src/datasource/datasource.rs | 14 + .../src/datasource/default_table_source.rs | 10 +- .../core/src/datasource/file_format/avro.rs | 8 +- .../src/datasource/file_format/file_type.rs | 233 ++- .../core/src/datasource/file_format/mod.rs | 1 + .../file_format}/options.rs | 2 +- datafusion/core/src/datasource/view.rs | 13 +- datafusion/core/src/execution/context.rs | 138 +- datafusion/core/src/execution/mod.rs | 3 +- .../core/src/physical_optimizer/pruning.rs | 95 +- .../core/src/physical_plan/aggregates/mod.rs | 72 +- .../src/physical_plan/coalesce_batches.rs | 16 +- .../core/src/physical_plan/file_format/csv.rs | 18 +- .../src/physical_plan/file_format/json.rs | 31 +- .../src/physical_plan/file_format/parquet.rs | 7 + .../file_format/parquet/page_filter.rs | 14 +- .../physical_plan/joins/sort_merge_join.rs | 14 +- datafusion/core/src/physical_plan/mod.rs | 11 +- datafusion/core/src/physical_plan/planner.rs | 21 +- .../core/src/physical_plan/sorts/sort.rs | 2 +- datafusion/core/src/test/mod.rs | 15 +- datafusion/core/tests/custom_sources.rs | 34 +- datafusion/core/tests/dataframe.rs | 170 ++- datafusion/core/tests/dataframe_functions.rs | 8 +- datafusion/core/tests/sql/aggregates.rs | 28 +- datafusion/core/tests/sql/avro.rs | 16 +- datafusion/core/tests/sql/explain_analyze.rs | 15 +- datafusion/core/tests/sql/expr.rs | 60 +- datafusion/core/tests/sql/functions.rs | 16 +- datafusion/core/tests/sql/group_by.rs | 
18 +- datafusion/core/tests/sql/intersection.rs | 87 -- datafusion/core/tests/sql/joins.rs | 644 ++++----- datafusion/core/tests/sql/json.rs | 24 +- datafusion/core/tests/sql/limit.rs | 162 --- datafusion/core/tests/sql/mod.rs | 24 +- datafusion/core/tests/sql/parquet.rs | 20 +- datafusion/core/tests/sql/predicates.rs | 505 +------ datafusion/core/tests/sql/projection.rs | 50 +- datafusion/core/tests/sql/select.rs | 28 +- datafusion/core/tests/sql/set_variable.rs | 54 +- datafusion/core/tests/sql/subqueries.rs | 60 + datafusion/core/tests/sql/udf.rs | 60 +- datafusion/core/tests/sql/union.rs | 20 +- datafusion/core/tests/sql/wildcard.rs | 76 +- datafusion/core/tests/sql/window.rs | 1273 +---------------- datafusion/core/tests/sqllogictests/README.md | 2 +- .../sqllogictests/test_files/explain.slt | 5 +- .../test_files/information_schema.slt | 2 +- .../sqllogictests/test_files/intersection.slt | 45 + .../tests/sqllogictests/test_files/limit.slt | 302 ++++ .../tests/sqllogictests/test_files/order.slt | 35 + .../sqllogictests/test_files/predicates.slt | 279 ++++ .../tests/sqllogictests/test_files/window.slt | 846 ++++++++++- datafusion/core/tests/tpcds_planning.rs | 9 + .../core/tests/user_defined_aggregates.rs | 12 +- datafusion/core/tests/user_defined_plan.rs | 3 +- datafusion/expr/Cargo.toml | 6 +- datafusion/expr/src/expr.rs | 33 +- datafusion/expr/src/logical_plan/builder.rs | 91 +- datafusion/expr/src/logical_plan/plan.rs | 17 +- datafusion/expr/src/operator.rs | 29 + datafusion/expr/src/table_source.rs | 17 +- datafusion/expr/src/type_coercion.rs | 15 +- datafusion/expr/src/window_frame.rs | 2 +- datafusion/jit/Cargo.toml | 8 +- datafusion/optimizer/Cargo.toml | 13 +- .../optimizer/src/decorrelate_where_exists.rs | 26 + .../optimizer/src/decorrelate_where_in.rs | 31 + .../optimizer/src/eliminate_cross_join.rs | 12 +- .../optimizer/src/eliminate_outer_join.rs | 4 +- datafusion/optimizer/src/eliminate_project.rs | 96 ++ datafusion/optimizer/src/inline_table_scan.rs | 45 +- datafusion/optimizer/src/lib.rs | 3 + datafusion/optimizer/src/merge_projection.rs | 166 +++ datafusion/optimizer/src/optimizer.rs | 8 +- datafusion/optimizer/src/push_down_filter.rs | 154 +- .../src/replace_distinct_aggregate.rs | 103 ++ .../simplify_expressions/expr_simplifier.rs | 2 +- datafusion/optimizer/src/type_coercion.rs | 206 ++- .../src/unwrap_cast_in_comparison.rs | 2 +- datafusion/optimizer/src/utils.rs | 4 +- .../optimizer/tests/integration-test.rs | 111 +- datafusion/physical-expr/Cargo.toml | 14 +- .../physical-expr/src/expressions/binary.rs | 92 +- .../physical-expr/src/expressions/try_cast.rs | 14 +- datafusion/physical-expr/src/planner.rs | 5 +- datafusion/proto/Cargo.toml | 12 +- datafusion/proto/proto/datafusion.proto | 6 + datafusion/proto/src/generated/pbjson.rs | 123 ++ datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/logical_plan/from_proto.rs | 6 + datafusion/proto/src/logical_plan/mod.rs | 11 + datafusion/proto/src/logical_plan/to_proto.rs | 9 +- datafusion/row/Cargo.toml | 8 +- datafusion/sql/Cargo.toml | 8 +- datafusion/sql/src/parser.rs | 3 +- datafusion/sql/src/query.rs | 53 +- datafusion/substrait/Cargo.toml | 4 +- .../substrait/tests/roundtrip_logical_plan.rs | 17 +- docs/source/user-guide/configs.md | 2 +- docs/source/user-guide/dataframe.md | 2 +- parquet-test-utils/Cargo.toml | 2 +- test-utils/Cargo.toml | 2 +- 127 files changed, 4723 insertions(+), 3700 deletions(-) create mode 100644 datafusion-examples/examples/catalog.rs rename datafusion/core/src/{execution => 
datasource/file_format}/options.rs (99%) delete mode 100644 datafusion/core/tests/sql/intersection.rs create mode 100644 datafusion/core/tests/sqllogictests/test_files/intersection.slt create mode 100644 datafusion/core/tests/sqllogictests/test_files/limit.slt create mode 100644 datafusion/core/tests/sqllogictests/test_files/predicates.slt create mode 100644 datafusion/optimizer/src/eliminate_project.rs create mode 100644 datafusion/optimizer/src/merge_projection.rs create mode 100644 datafusion/optimizer/src/replace_distinct_aggregate.rs diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a5d36ce96a20..3116a720a9c8 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -24,6 +24,10 @@ updates: open-pull-requests-limit: 10 target-branch: main labels: [auto-dependencies] + ignore: + # arrow is bumped manually + - dependency-name: "arrow*" + update-types: ["version-update:semver-major"] - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/README.md b/README.md index 0069d6c94f59..48d99ed88e9b 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ DataFusion can be used without modification as an embedded SQL engine or can be customized and used as a foundation for building new systems. Here are some examples of systems built using DataFusion: -- Specialized Analytical Database systems such as [CeresDB] and more general spark like system such a [Ballista]. +- Specialized Analytical Database systems such as [CeresDB] and more general Apache Spark like system such a [Ballista]. - New query language engines such as [prql-query] and accelerators such as [VegaFusion] - Research platform for new Database Systems, such as [Flock] - SQL support to another library, such as [dask sql] @@ -145,6 +145,7 @@ Here are some of the projects known to use DataFusion: - [Synnada](https://synnada.ai/) Streaming-first framework for data products - [Tensorbase](https://github.com/tensorbase/tensorbase) - [VegaFusion](https://vegafusion.io/) Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar +- [ZincObserve](https://github.com/zinclabs/zincobserve) Distributed cloud native observability platform [ballista]: https://github.com/apache/arrow-ballista [blaze]: https://github.com/blaze-init/blaze @@ -166,7 +167,8 @@ Here are some of the projects known to use DataFusion: [seafowl]: https://github.com/splitgraph/seafowl [synnada]: https://synnada.ai/ [tensorbase]: https://github.com/tensorbase/tensorbase -[vegafusion]: https://vegafusion.io/ "if you know of another project, please submit a PR to add a link!" +[vegafusion]: https://vegafusion.io/ +[zincobserve]: https://github.com/zinclabs/zincobserve "if you know of another project, please submit a PR to add a link!" 
## Examples diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 02eeb00a3de5..37b6bfc4ead6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-benchmarks" description = "DataFusion Benchmarks" -version = "18.0.0" +version = "19.0.0" edition = "2021" authors = ["Apache Arrow "] homepage = "https://github.com/apache/arrow-datafusion" @@ -33,14 +33,14 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = "32.0.0" -datafusion = { path = "../datafusion/core", version = "18.0.0", features = ["scheduler"] } +arrow = "34.0.0" +datafusion = { path = "../datafusion/core", version = "19.0.0", features = ["scheduler"] } env_logger = "0.10" futures = "0.3" mimalloc = { version = "0.1", optional = true, default-features = false } num_cpus = "1.13.0" object_store = "0.5.4" -parquet = "32.0.0" +parquet = "34.0.0" parquet-test-utils = { path = "../parquet-test-utils/", version = "0.1.0" } rand = "0.8.4" serde = { version = "1.0.136", features = ["derive"] } @@ -51,4 +51,4 @@ test-utils = { path = "../test-utils/", version = "0.1.0" } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread", "parking_lot"] } [dev-dependencies] -datafusion-proto = { path = "../datafusion/proto", version = "18.0.0" } +datafusion-proto = { path = "../datafusion/proto", version = "19.0.0" } diff --git a/benchmarks/expected-plans/q15.txt b/benchmarks/expected-plans/q15.txt index f4e053f8d421..99ec50eb84b0 100644 --- a/benchmarks/expected-plans/q15.txt +++ b/benchmarks/expected-plans/q15.txt @@ -5,19 +5,16 @@ Sort: supplier.s_suppkey ASC NULLS LAST Inner Join: supplier.s_suppkey = revenue0.supplier_no TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] SubqueryAlias: revenue0 - Projection: supplier_no, total_revenue - Projection: lineitem.l_suppkey AS supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue - Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") - TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] + Projection: lineitem.l_suppkey AS supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue + Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") + TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] SubqueryAlias: __scalar_sq_1 Projection: MAX(revenue0.total_revenue) AS __value Aggregate: groupBy=[[]], aggr=[[MAX(revenue0.total_revenue)]] SubqueryAlias: revenue0 - Projection: total_revenue - Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue - Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) - Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount 
AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") - TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] + Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue + Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4))) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") + TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] EmptyRelation \ No newline at end of file diff --git a/benchmarks/expected-plans/q16.txt b/benchmarks/expected-plans/q16.txt index 6af486a2a0ab..014b1b7b4161 100644 --- a/benchmarks/expected-plans/q16.txt +++ b/benchmarks/expected-plans/q16.txt @@ -1,14 +1,13 @@ Sort: supplier_cnt DESC NULLS FIRST, part.p_brand ASC NULLS LAST, part.p_type ASC NULLS LAST, part.p_size ASC NULLS LAST - Projection: part.p_brand, part.p_type, part.p_size, COUNT(DISTINCT partsupp.ps_suppkey) AS supplier_cnt - Projection: group_alias_0 AS part.p_brand, group_alias_1 AS part.p_type, group_alias_2 AS part.p_size, COUNT(alias1) AS COUNT(DISTINCT partsupp.ps_suppkey) - Aggregate: groupBy=[[group_alias_0, group_alias_1, group_alias_2]], aggr=[[COUNT(alias1)]] - Aggregate: groupBy=[[part.p_brand AS group_alias_0, part.p_type AS group_alias_1, part.p_size AS group_alias_2, partsupp.ps_suppkey AS alias1]], aggr=[[]] - LeftAnti Join: partsupp.ps_suppkey = __correlated_sq_1.s_suppkey - Inner Join: partsupp.ps_partkey = part.p_partkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey] - Filter: part.p_brand != Utf8("Brand#45") AND part.p_type NOT LIKE Utf8("MEDIUM POLISHED%") AND part.p_size IN ([Int32(49), Int32(14), Int32(23), Int32(45), Int32(19), Int32(3), Int32(36), Int32(9)]) - TableScan: part projection=[p_partkey, p_brand, p_type, p_size] - SubqueryAlias: __correlated_sq_1 - Projection: supplier.s_suppkey AS s_suppkey - Filter: supplier.s_comment LIKE Utf8("%Customer%Complaints%") - TableScan: supplier projection=[s_suppkey, s_comment] + Projection: group_alias_0 AS part.p_brand, group_alias_1 AS part.p_type, group_alias_2 AS part.p_size, COUNT(alias1) AS supplier_cnt + Aggregate: groupBy=[[group_alias_0, group_alias_1, group_alias_2]], aggr=[[COUNT(alias1)]] + Aggregate: groupBy=[[part.p_brand AS group_alias_0, part.p_type AS group_alias_1, part.p_size AS group_alias_2, partsupp.ps_suppkey AS alias1]], aggr=[[]] + LeftAnti Join: partsupp.ps_suppkey = __correlated_sq_1.s_suppkey + Inner Join: partsupp.ps_partkey = part.p_partkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey] + Filter: part.p_brand != Utf8("Brand#45") AND part.p_type NOT LIKE Utf8("MEDIUM POLISHED%") AND part.p_size IN ([Int32(49), Int32(14), Int32(23), Int32(45), Int32(19), Int32(3), Int32(36), Int32(9)]) + TableScan: part projection=[p_partkey, p_brand, p_type, p_size] + SubqueryAlias: __correlated_sq_1 + Projection: supplier.s_suppkey AS s_suppkey + Filter: supplier.s_comment LIKE Utf8("%Customer%Complaints%") + TableScan: supplier projection=[s_suppkey, s_comment] \ No newline at end of file diff --git a/benchmarks/expected-plans/q18.txt b/benchmarks/expected-plans/q18.txt index 639598725ce0..01f60ba55f31 
100644 --- a/benchmarks/expected-plans/q18.txt +++ b/benchmarks/expected-plans/q18.txt @@ -1,14 +1,13 @@ Sort: orders.o_totalprice DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST - Projection: customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice, SUM(lineitem.l_quantity) - Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[SUM(lineitem.l_quantity)]] - LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_name] - TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] - TableScan: lineitem projection=[l_orderkey, l_quantity] - SubqueryAlias: __correlated_sq_1 - Projection: lineitem.l_orderkey AS l_orderkey - Filter: SUM(lineitem.l_quantity) > Decimal128(Some(30000),25,2) - Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[SUM(lineitem.l_quantity)]] - TableScan: lineitem projection=[l_orderkey, l_quantity] \ No newline at end of file + Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[SUM(lineitem.l_quantity)]] + LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_name] + TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] + TableScan: lineitem projection=[l_orderkey, l_quantity] + SubqueryAlias: __correlated_sq_1 + Projection: lineitem.l_orderkey AS l_orderkey + Filter: SUM(lineitem.l_quantity) > Decimal128(Some(30000),25,2) + Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[SUM(lineitem.l_quantity)]] + TableScan: lineitem projection=[l_orderkey, l_quantity] \ No newline at end of file diff --git a/benchmarks/expected-plans/q2.txt b/benchmarks/expected-plans/q2.txt index 571c320e9e1a..d291548c2db8 100644 --- a/benchmarks/expected-plans/q2.txt +++ b/benchmarks/expected-plans/q2.txt @@ -1,26 +1,25 @@ Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, nation.n_name - Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.__value - Inner Join: nation.n_regionkey = region.r_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - Inner Join: part.p_partkey = partsupp.ps_partkey - Filter: part.p_size = Int32(15) AND part.p_type LIKE Utf8("%BRASS") - TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size] - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - TableScan: nation projection=[n_nationkey, n_name, n_regionkey] - Filter: region.r_name = Utf8("EUROPE") - TableScan: region projection=[r_regionkey, r_name] - SubqueryAlias: __scalar_sq_1 - Projection: partsupp.ps_partkey, MIN(partsupp.ps_supplycost) AS __value - 
Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[MIN(partsupp.ps_supplycost)]] - Inner Join: nation.n_regionkey = region.r_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: nation projection=[n_nationkey, n_regionkey] - Filter: region.r_name = Utf8("EUROPE") - TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file + Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.__value + Inner Join: nation.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = partsupp.ps_partkey + Filter: part.p_size = Int32(15) AND part.p_type LIKE Utf8("%BRASS") + TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size] + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] + TableScan: nation projection=[n_nationkey, n_name, n_regionkey] + Filter: region.r_name = Utf8("EUROPE") + TableScan: region projection=[r_regionkey, r_name] + SubqueryAlias: __scalar_sq_1 + Projection: partsupp.ps_partkey, MIN(partsupp.ps_supplycost) AS __value + Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[MIN(partsupp.ps_supplycost)]] + Inner Join: nation.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_nationkey] + TableScan: nation projection=[n_nationkey, n_regionkey] + Filter: region.r_name = Utf8("EUROPE") + TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q21.txt b/benchmarks/expected-plans/q21.txt index a91632df4e47..c1d7417d8318 100644 --- a/benchmarks/expected-plans/q21.txt +++ b/benchmarks/expected-plans/q21.txt @@ -14,9 +14,8 @@ Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST TableScan: orders projection=[o_orderkey, o_orderstatus] Filter: nation.n_name = Utf8("SAUDI ARABIA") TableScan: nation projection=[n_nationkey, n_name] - Projection: l2.l_orderkey, l2.l_suppkey - SubqueryAlias: l2 - TableScan: lineitem projection=[l_orderkey, l_suppkey] + SubqueryAlias: l2 + TableScan: lineitem projection=[l_orderkey, l_suppkey] Projection: l3.l_orderkey, l3.l_suppkey SubqueryAlias: l3 Filter: lineitem.l_receiptdate > lineitem.l_commitdate diff --git a/benchmarks/expected-plans/q22.txt b/benchmarks/expected-plans/q22.txt index 11b438085a0b..0fd7a590ac19 100644 --- a/benchmarks/expected-plans/q22.txt +++ b/benchmarks/expected-plans/q22.txt @@ -8,8 +8,7 @@ Sort: custsale.cntrycode ASC NULLS LAST LeftAnti Join: customer.c_custkey = orders.o_custkey Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) TableScan: customer projection=[c_custkey, c_phone, c_acctbal] - Projection: orders.o_custkey - TableScan: orders projection=[o_custkey] + TableScan: orders projection=[o_custkey] SubqueryAlias: __scalar_sq_1 Projection: AVG(customer.c_acctbal) AS __value Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] diff --git 
a/benchmarks/expected-plans/q8.txt b/benchmarks/expected-plans/q8.txt index 75e65d835bb4..e8c47b2ab7d3 100644 --- a/benchmarks/expected-plans/q8.txt +++ b/benchmarks/expected-plans/q8.txt @@ -3,24 +3,23 @@ Sort: all_nations.o_year ASC NULLS LAST Aggregate: groupBy=[[all_nations.o_year]], aggr=[[SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), SUM(all_nations.volume)]] SubqueryAlias: all_nations Projection: datepart(Utf8("YEAR"), orders.o_orderdate) AS o_year, CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) AS volume, n2.n_name AS nation - Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n2.n_name - Inner Join: n1.n_regionkey = region.r_regionkey - Inner Join: supplier.s_nationkey = n2.n_nationkey - Inner Join: customer.c_nationkey = n1.n_nationkey - Inner Join: orders.o_custkey = customer.c_custkey - Inner Join: lineitem.l_orderkey = orders.o_orderkey - Inner Join: lineitem.l_suppkey = supplier.s_suppkey - Inner Join: part.p_partkey = lineitem.l_partkey - Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") - TableScan: part projection=[p_partkey, p_type] - TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] - TableScan: supplier projection=[s_suppkey, s_nationkey] - Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate] - TableScan: customer projection=[c_custkey, c_nationkey] - SubqueryAlias: n1 - TableScan: nation projection=[n_nationkey, n_regionkey] - SubqueryAlias: n2 - TableScan: nation projection=[n_nationkey, n_name] - Filter: region.r_name = Utf8("AMERICA") - TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file + Inner Join: n1.n_regionkey = region.r_regionkey + Inner Join: supplier.s_nationkey = n2.n_nationkey + Inner Join: customer.c_nationkey = n1.n_nationkey + Inner Join: orders.o_custkey = customer.c_custkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Inner Join: lineitem.l_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = lineitem.l_partkey + Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") + TableScan: part projection=[p_partkey, p_type] + TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate] + TableScan: customer projection=[c_custkey, c_nationkey] + SubqueryAlias: n1 + TableScan: nation projection=[n_nationkey, n_regionkey] + SubqueryAlias: n2 + TableScan: nation projection=[n_nationkey, n_name] + Filter: region.r_name = Utf8("AMERICA") + TableScan: region projection=[r_regionkey, r_name] \ No newline at end of file diff --git a/benchmarks/expected-plans/q9.txt b/benchmarks/expected-plans/q9.txt index 166c98d97106..c83f9d945da5 100644 --- a/benchmarks/expected-plans/q9.txt +++ b/benchmarks/expected-plans/q9.txt @@ -3,16 +3,15 @@ Sort: profit.nation ASC NULLS LAST, profit.o_year DESC NULLS FIRST Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[SUM(profit.amount)]] SubqueryAlias: profit Projection: nation.n_name AS nation, 
datepart(Utf8("YEAR"), orders.o_orderdate) AS o_year, CAST(lineitem.l_extendedprice AS Decimal128(38, 4)) * CAST(Decimal128(Some(100),23,2) - CAST(lineitem.l_discount AS Decimal128(23, 2)) AS Decimal128(38, 4)) - CAST(partsupp.ps_supplycost * lineitem.l_quantity AS Decimal128(38, 4)) AS amount - Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, partsupp.ps_supplycost, orders.o_orderdate, nation.n_name - Inner Join: supplier.s_nationkey = nation.n_nationkey - Inner Join: lineitem.l_orderkey = orders.o_orderkey - Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey - Inner Join: lineitem.l_suppkey = supplier.s_suppkey - Inner Join: part.p_partkey = lineitem.l_partkey - Filter: part.p_name LIKE Utf8("%green%") - TableScan: part projection=[p_partkey, p_name] - TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: orders projection=[o_orderkey, o_orderdate] - TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file + Inner Join: supplier.s_nationkey = nation.n_nationkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey + Inner Join: lineitem.l_suppkey = supplier.s_suppkey + Inner Join: part.p_partkey = lineitem.l_partkey + Filter: part.p_name LIKE Utf8("%green%") + TableScan: part projection=[p_partkey, p_name] + TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] + TableScan: supplier projection=[s_suppkey, s_nationkey] + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: orders projection=[o_orderkey, o_orderdate] + TableScan: nation projection=[n_nationkey, n_name] \ No newline at end of file diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 955861be1b51..e83c77d7a15d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -68,9 +68,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87d948f553cf556656eb89265700258e1032d26fec9b7920cd20319336e06afd" +checksum = "f410d3907b6b3647b9e7bca4551274b2e3d716aa940afb67b7287257401da921" dependencies = [ "ahash", "arrow-arith", @@ -91,9 +91,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf30d4ebc3df9dfd8bd26883aa30687d4ddcfd7b2443e62bd7c8fedf153b8e45" +checksum = "f87391cf46473c9bc53dab68cb8872c3a81d4dfd1703f1c8aa397dba9880a043" dependencies = [ "arrow-array", "arrow-buffer", @@ -106,9 +106,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fe66ec388d882a61fff3eb613b5266af133aa08a3318e5e493daf0f5c1696cb" +checksum = "d35d5475e65c57cffba06d0022e3006b677515f99b54af33a7cd54f6cdd4a5b5" dependencies = [ "ahash", "arrow-buffer", @@ -122,9 +122,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "32.0.0" +version = "34.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ef967dadbccd4586ec8d7aab27d7033ecb5dfae8a605c839613039eac227bda" +checksum = "68b4ec72eda7c0207727df96cf200f539749d736b21f3e782ece113e18c1a0a7" dependencies = [ "half", "num", @@ -132,9 +132,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "491a7979ea9e76dc218f532896e2d245fde5235e2e6420ce80d27cf6395dda84" +checksum = "0a7285272c9897321dfdba59de29f5b05aeafd3cdedf104a941256d155f6d304" dependencies = [ "arrow-array", "arrow-buffer", @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b1d4fc91078dbe843c2c50d90f8119c96e8dfac2f78d30f7a8cb9397399c61d" +checksum = "981ee4e7f6a120da04e00d0b39182e1eeacccb59c8da74511de753c56b7fddf7" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee0c0e3c5d3b80be8f267f4b2af714c08cad630569be01a8379cfe27b4866495" +checksum = "27cc673ee6989ea6e4b4e8c7d461f7e06026a096c8f0b1a7288885ff71ae1e56" dependencies = [ "arrow-buffer", "arrow-schema", @@ -179,9 +179,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a3ca7eb8d23c83fe40805cbafec70a6a31df72de47355545ff34c850f715403" +checksum = "e37b8b69d9e59116b6b538e8514e0ec63a30f08b617ce800d31cb44e3ef64c1a" dependencies = [ "arrow-array", "arrow-buffer", @@ -193,9 +193,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf65aff76d2e340d827d5cab14759e7dd90891a288347e2202e4ee28453d9bed" +checksum = "80c3fa0bed7cfebf6d18e46b733f9cb8a1cb43ce8e6539055ca3e1e48a426266" dependencies = [ "arrow-array", "arrow-buffer", @@ -212,9 +212,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "074a5a55c37ae4750af4811c8861c0378d8ab2ff6c262622ad24efae6e0b73b3" +checksum = "d247dce7bed6a8d6a3c6debfa707a3a2f694383f0c692a39d736a593eae5ef94" dependencies = [ "arrow-array", "arrow-buffer", @@ -226,9 +226,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e064ac4e64960ebfbe35f218f5e7d9dc9803b59c2e56f611da28ce6d008f839e" +checksum = "8d609c0181f963cea5c70fddf9a388595b5be441f3aa1d1cdbf728ca834bbd3a" dependencies = [ "ahash", "arrow-array", @@ -241,15 +241,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ead3f373b9173af52f2fdefcb5a7dd89f453fbc40056f574a8aeb23382a4ef81" +checksum = "64951898473bfb8e22293e83a44f02874d2257514d49cd95f9aa4afcff183fbc" [[package]] name = "arrow-select" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646b4f15b5a77c970059e748aeb1539705c68cd397ecf0f0264c4ef3737d35f3" +checksum = 
"2a513d89c2e1ac22b28380900036cf1f3992c6443efc5e079de631dcf83c6888" dependencies = [ "arrow-array", "arrow-buffer", @@ -260,9 +260,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8b8bf150caaeca03f39f1a91069701387d93f7cfd256d27f423ac8496d99a51" +checksum = "5288979b2705dae1114c864d73150629add9153b9b8f1d7ee3963db94c372ba5" dependencies = [ "arrow-array", "arrow-buffer", @@ -287,6 +287,8 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", + "zstd 0.11.2+zstd.1.5.2", + "zstd-safe 5.0.2+zstd.1.5.2", ] [[package]] @@ -382,18 +384,6 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "bumpalo" version = "3.12.0" @@ -602,13 +592,12 @@ dependencies = [ [[package]] name = "csv" -version = "1.1.6" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" dependencies = [ - "bstr", "csv-core", - "itoa 0.4.8", + "itoa", "ryu", "serde", ] @@ -624,9 +613,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" dependencies = [ "cc", "cxxbridge-flags", @@ -636,9 +625,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" +checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" dependencies = [ "cc", "codespan-reporting", @@ -651,15 +640,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" +checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" [[package]] name = "cxxbridge-macro" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" dependencies = [ "proc-macro2", "quote", @@ -681,7 +670,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "18.0.0" +version = "19.0.0" dependencies = [ "ahash", "arrow", @@ -722,11 +711,12 @@ dependencies = [ "url", "uuid", "xz2", + "zstd 0.11.2+zstd.1.5.2", ] [[package]] name = "datafusion-cli" -version = "18.0.0" +version = "19.0.0" dependencies = [ "arrow", "async-trait", @@ -744,7 +734,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "18.0.0" +version = "19.0.0" dependencies = [ "arrow", "chrono", @@ -756,7 +746,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "18.0.0" +version = "19.0.0" dependencies = [ "ahash", 
"arrow", @@ -767,7 +757,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "18.0.0" +version = "19.0.0" dependencies = [ "arrow", "async-trait", @@ -776,13 +766,14 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.13.2", + "itertools", "log", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "18.0.0" +version = "19.0.0" dependencies = [ "ahash", "arrow", @@ -811,7 +802,7 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "18.0.0" +version = "19.0.0" dependencies = [ "arrow", "datafusion-common", @@ -821,7 +812,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "18.0.0" +version = "19.0.0" dependencies = [ "arrow-schema", "datafusion-common", @@ -955,9 +946,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] @@ -1194,13 +1185,13 @@ dependencies = [ [[package]] name = "http" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes", "fnv", - "itoa 1.0.5", + "itoa", ] [[package]] @@ -1247,7 +1238,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.5", + "itoa", "pin-project-lite", "socket2", "tokio", @@ -1353,12 +1344,6 @@ dependencies = [ "either", ] -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - [[package]] name = "itoa" version = "1.0.5" @@ -1581,14 +1566,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" +checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -1729,9 +1714,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "ordered-float" @@ -1773,9 +1758,9 @@ dependencies = [ [[package]] name = "parquet" -version = "32.0.0" +version = "34.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b3d4917209e17e1da5fb07d276da237a42465f0def2b8d5fa5ce0e85855b4c" +checksum = "7ac135ecf63ebb5f53dda0921b0b76d6048b3ef631a5f4760b9e8f863ff00cfa" dependencies = [ "ahash", "arrow-array", @@ -1801,7 +1786,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd", + "zstd 0.12.3+zstd.1.5.2", ] [[package]] @@ -1969,12 +1954,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" version = "0.6.28" @@ -2194,7 +2173,7 @@ version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cad406b69c91885b5107daf2c29572f6c8cdb3c66826821e286c533490c0bc76" dependencies = [ - "itoa 1.0.5", + "itoa", "ryu", "serde", ] @@ -2206,7 +2185,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.5", + "itoa", "ryu", "serde", ] @@ -2224,9 +2203,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" dependencies = [ "autocfg", ] @@ -2347,9 +2326,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", @@ -2483,9 +2462,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" dependencies = [ "futures-core", "pin-project-lite", @@ -2494,9 +2473,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6a3b08b64e6dfad376fa2432c7b1f01522e37a623c3050bc95db2d3ff21583" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" dependencies = [ "bytes", "futures-core", @@ -2902,13 +2881,32 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe 5.0.2+zstd.1.5.2", +] + [[package]] name = "zstd" version = "0.12.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" dependencies = [ - "zstd-safe", + "zstd-safe 6.0.4+zstd.1.5.4", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 375e86b7b654..53404dcfe0cf 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,10 +18,10 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." 
-version = "18.0.0" +version = "19.0.0" authors = ["Apache Arrow "] edition = "2021" -keywords = [ "arrow", "datafusion", "query", "sql" ] +keywords = ["arrow", "datafusion", "query", "sql"] license = "Apache-2.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" @@ -29,10 +29,10 @@ rust-version = "1.62" readme = "README.md" [dependencies] -arrow = "32.0.0" +arrow = "34.0.0" async-trait = "0.1.41" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "18.0.0" } +datafusion = { path = "../datafusion/core", version = "19.0.0" } dirs = "4.0.0" env_logger = "0.9" mimalloc = { version = "0.1", default-features = false } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 4f50ca614730..4cccad47b925 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-examples" description = "DataFusion usage examples" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] @@ -34,9 +34,9 @@ path = "examples/avro_sql.rs" required-features = ["datafusion/avro"] [dev-dependencies] -arrow = "32.0.0" -arrow-flight = { version = "32.0.0", features = ["flight-sql-experimental"] } -arrow-schema = "32.0.0" +arrow = "34.0.0" +arrow-flight = { version = "34.0.0", features = ["flight-sql-experimental"] } +arrow-schema = "34.0.0" async-trait = "0.1.41" dashmap = "5.4" datafusion = { path = "../datafusion/core" } diff --git a/datafusion-examples/examples/catalog.rs b/datafusion-examples/examples/catalog.rs new file mode 100644 index 000000000000..30cc2c8bd618 --- /dev/null +++ b/datafusion-examples/examples/catalog.rs @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Simple example of a catalog/schema implementation. +//! +//! Example requires git submodules to be initialized in repo as it uses data from +//! the `parquet-testing` repo. 
+use async_trait::async_trait; +use datafusion::{ + arrow::util::pretty, + catalog::{ + catalog::{CatalogList, CatalogProvider}, + schema::SchemaProvider, + }, + datasource::{ + file_format::{csv::CsvFormat, parquet::ParquetFormat, FileFormat}, + listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, + TableProvider, + }, + error::Result, + execution::context::SessionState, + prelude::SessionContext, +}; +use std::sync::RwLock; +use std::{ + any::Any, + collections::HashMap, + path::{Path, PathBuf}, + sync::Arc, +}; + +#[tokio::main] +async fn main() -> Result<()> { + let repo_dir = std::fs::canonicalize( + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + // parent dir of datafusion-examples = repo root + .join(".."), + ) + .unwrap(); + let mut ctx = SessionContext::new(); + let state = ctx.state(); + let catlist = Arc::new(CustomCatalogList::new()); + // use our custom catalog list for context. each context has a single catalog list. + // context will by default have MemoryCatalogList + ctx.register_catalog_list(catlist.clone()); + + // intitialize our catalog and schemas + let catalog = DirCatalog::new(); + let parquet_schema = DirSchema::create( + &state, + DirSchemaOpts { + format: Arc::new(ParquetFormat::default()), + dir: &repo_dir.join("parquet-testing").join("data"), + ext: "parquet", + }, + ) + .await?; + let csv_schema = DirSchema::create( + &state, + DirSchemaOpts { + format: Arc::new(CsvFormat::default()), + dir: &repo_dir.join("testing").join("data").join("csv"), + ext: "csv", + }, + ) + .await?; + // register schemas into catalog + catalog.register_schema("parquet", parquet_schema.clone())?; + catalog.register_schema("csv", csv_schema.clone())?; + // register our catalog in the context + ctx.register_catalog("dircat", Arc::new(catalog)); + { + // catalog was passed down into our custom catalog list since we overide the ctx's default + let catalogs = catlist.catalogs.read().unwrap(); + assert!(catalogs.contains_key("dircat")); + }; + // take the first 5 (arbitrary amount) keys from our schema's hashmap. + // in our `DirSchema`, the table names are equivalent to their key in the hashmap, + // so any key in the hashmap will now be a queryable in our datafusion context. + let parquet_tables = { + let tables = parquet_schema.tables.read().unwrap(); + tables.keys().take(5).cloned().collect::>() + }; + for table in parquet_tables { + println!("querying table {table} from parquet schema"); + let df = ctx + .sql(&format!("select * from dircat.parquet.\"{table}\" ")) + .await? + .limit(0, Some(5))?; + let result = df.collect().await; + match result { + Ok(batches) => { + pretty::print_batches(&batches).unwrap(); + } + Err(e) => { + println!("table '{table}' query failed due to {e}"); + } + } + } + let table_to_drop = { + let parquet_tables = parquet_schema.tables.read().unwrap(); + parquet_tables.keys().next().unwrap().to_owned() + }; + // DDL example + let df = ctx + .sql(&format!("DROP TABLE dircat.parquet.\"{table_to_drop}\"")) + .await?; + df.collect().await?; + let parquet_tables = parquet_schema.tables.read().unwrap(); + // datafusion has deregistered the table from our schema + // (called our schema's deregister func) + assert!(!parquet_tables.contains_key(&table_to_drop)); + Ok(()) +} + +struct DirSchemaOpts<'a> { + ext: &'a str, + dir: &'a Path, + format: Arc, +} +/// Schema where every file with extension `ext` in a given `dir` is a table. 
+struct DirSchema { + ext: String, + tables: RwLock>>, +} +impl DirSchema { + async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result> { + let DirSchemaOpts { ext, dir, format } = opts; + let mut tables = HashMap::new(); + let listdir = std::fs::read_dir(dir).unwrap(); + for res in listdir { + let entry = res.unwrap(); + let filename = entry.file_name().to_str().unwrap().to_string(); + if !filename.ends_with(ext) { + continue; + } + + let table_path = ListingTableUrl::parse(entry.path().to_str().unwrap())?; + let opts = ListingOptions::new(format.clone()); + let conf = ListingTableConfig::new(table_path) + .with_listing_options(opts) + .infer_schema(state) + .await?; + let table = ListingTable::try_new(conf)?; + tables.insert(filename, Arc::new(table) as Arc); + } + Ok(Arc::new(Self { + tables: RwLock::new(tables), + ext: ext.to_string(), + })) + } + #[allow(unused)] + fn name(&self) -> &str { + &self.ext + } +} + +#[async_trait] +impl SchemaProvider for DirSchema { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let tables = self.tables.read().unwrap(); + tables.keys().cloned().collect::>() + } + + async fn table(&self, name: &str) -> Option> { + let tables = self.tables.read().unwrap(); + tables.get(name).cloned() + } + + fn table_exist(&self, name: &str) -> bool { + let tables = self.tables.read().unwrap(); + tables.contains_key(name) + } + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + let mut tables = self.tables.write().unwrap(); + println!("adding table {name}"); + tables.insert(name, table.clone()); + Ok(Some(table)) + } + + /// If supported by the implementation, removes an existing table from this schema and returns it. + /// If no table of that name exists, returns Ok(None). + #[allow(unused_variables)] + fn deregister_table(&self, name: &str) -> Result>> { + let mut tables = self.tables.write().unwrap(); + println!("dropping table {name}"); + Ok(tables.remove(name)) + } +} +/// Catalog holds multiple schemas +struct DirCatalog { + schemas: RwLock>>, +} +impl DirCatalog { + fn new() -> Self { + Self { + schemas: RwLock::new(HashMap::new()), + } + } +} +impl CatalogProvider for DirCatalog { + fn as_any(&self) -> &dyn Any { + self + } + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + let mut schema_map = self.schemas.write().unwrap(); + schema_map.insert(name.to_owned(), schema.clone()); + Ok(Some(schema)) + } + + fn schema_names(&self) -> Vec { + let schemas = self.schemas.read().unwrap(); + schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + let schemas = self.schemas.read().unwrap(); + let maybe_schema = schemas.get(name); + if let Some(schema) = maybe_schema { + let schema = schema.clone() as Arc; + Some(schema) + } else { + None + } + } +} +/// Catalog lists holds multiple catalogs. Each context has a single catalog list. 
+struct CustomCatalogList { + catalogs: RwLock>>, +} +impl CustomCatalogList { + fn new() -> Self { + Self { + catalogs: RwLock::new(HashMap::new()), + } + } +} +impl CatalogList for CustomCatalogList { + fn as_any(&self) -> &dyn Any { + self + } + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + let mut cats = self.catalogs.write().unwrap(); + cats.insert(name, catalog.clone()); + Some(catalog) + } + + /// Retrieves the list of available catalog names + fn catalog_names(&self) -> Vec { + let cats = self.catalogs.read().unwrap(); + cats.keys().cloned().collect() + } + + /// Retrieves a specific catalog by name, provided it exists. + fn catalog(&self, name: &str) -> Option> { + let cats = self.catalogs.read().unwrap(); + cats.get(name).cloned() + } +} diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index 05785604c9d9..69519094b229 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -19,6 +19,194 @@ # Changelog +## [19.0.0](https://github.com/apache/arrow-datafusion/tree/19.0.0) (2023-02-24) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/18.0.0...19.0.0) + +**Breaking changes:** + +- Use DataFusionError instead of ArrowError in SendableRecordBatchStream [\#5101](https://github.com/apache/arrow-datafusion/pull/5101) ([comphead](https://github.com/comphead)) +- Update to arrow 32 and Switch to RawDecoder for JSON [\#5056](https://github.com/apache/arrow-datafusion/pull/5056) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([tustvold](https://github.com/tustvold)) +- Allow `SessionContext::read_csv`, etc to read multiple files [\#4908](https://github.com/apache/arrow-datafusion/pull/4908) ([saikrishna1-bidgely](https://github.com/saikrishna1-bidgely)) + +**Implemented enhancements:** + +- Ignore Arrow in dependabot [\#5340](https://github.com/apache/arrow-datafusion/issues/5340) +- Provide access to internal fields of SessionContext [\#5317](https://github.com/apache/arrow-datafusion/issues/5317) +- Investigate performance drop for DISTINCT queries [\#5313](https://github.com/apache/arrow-datafusion/issues/5313) +- \[DOC\] Update math expression documentation [\#5312](https://github.com/apache/arrow-datafusion/issues/5312) +- Replace merge\_batches with concat\_batches [\#5297](https://github.com/apache/arrow-datafusion/issues/5297) +- Support for some of the window frame range queries [\#5275](https://github.com/apache/arrow-datafusion/issues/5275) +- Make `log` function to be in sync with PostgresSql [\#5259](https://github.com/apache/arrow-datafusion/issues/5259) +- \[SQLLogicTest\] Make schema validation ignore nullable and metadata attributes [\#5231](https://github.com/apache/arrow-datafusion/issues/5231) +- Add support for linear groups search [\#5213](https://github.com/apache/arrow-datafusion/issues/5213) +- Add SQL function overload `LOG(base, x)` for logarithm of x to base [\#5206](https://github.com/apache/arrow-datafusion/issues/5206) +- `all_schema()` will get schema of child of child of .... 
[\#5192](https://github.com/apache/arrow-datafusion/issues/5192) +- Enable parquet parallel scans by default [\#5125](https://github.com/apache/arrow-datafusion/issues/5125) +- Don't repartition ProjectionExec when it does not compute anything [\#4968](https://github.com/apache/arrow-datafusion/issues/4968) +- Support non-tuple expression for Exists Subquery to Join [\#4934](https://github.com/apache/arrow-datafusion/issues/4934) +- Read multiple files/folders using `read_csv` [\#4909](https://github.com/apache/arrow-datafusion/issues/4909) + +**Fixed bugs:** + +- Make inline\_table\_scan optimize whole plan during first optimization stage. [\#5364](https://github.com/apache/arrow-datafusion/issues/5364) +- tpcds\_logical\_q8 ambiguous name. [\#5334](https://github.com/apache/arrow-datafusion/issues/5334) +- Protobuf serialisation is missing for GetIndexedFieldExpr [\#5323](https://github.com/apache/arrow-datafusion/issues/5323) +- Indexing a nested list with 0 or an index larger than list size is not handled correctly [\#5310](https://github.com/apache/arrow-datafusion/issues/5310) +- Protobuf serialization drops `preserve_partitioning` from `SortExec` [\#5305](https://github.com/apache/arrow-datafusion/issues/5305) +- data file without suffix can't be read correctly [\#5301](https://github.com/apache/arrow-datafusion/issues/5301) +- Idk [\#5298](https://github.com/apache/arrow-datafusion/issues/5298) +- Error with query that has DISTINCT with ORDER BY and aliased select list [\#5293](https://github.com/apache/arrow-datafusion/issues/5293) +- Optimizer prunes UnnestExec on aggregate count [\#5281](https://github.com/apache/arrow-datafusion/issues/5281) +- Strange Behaviour on RepartitionExec with CoalescePartitionsExec. [\#5278](https://github.com/apache/arrow-datafusion/issues/5278) +- Error "For SELECT DISTINCT, ORDER BY expressions id must appear in select list" may be over eager [\#5255](https://github.com/apache/arrow-datafusion/issues/5255) +- SQL allows SORT BY keyword [\#5247](https://github.com/apache/arrow-datafusion/issues/5247) +- test `sort_on_window_null_string` failed after disable `skip_fail`. [\#5233](https://github.com/apache/arrow-datafusion/issues/5233) +- Dataframe API adds ?table? 
qualifier [\#5187](https://github.com/apache/arrow-datafusion/issues/5187) +- Re-ordering Projections in scan are not working anymore \(since DF15\) [\#5146](https://github.com/apache/arrow-datafusion/issues/5146) +- parquet page level skipping \(page index pruning\) doesn't work with evolved schemas [\#5104](https://github.com/apache/arrow-datafusion/issues/5104) +- Incorrect results on queries with `distinct` and orderby [\#5065](https://github.com/apache/arrow-datafusion/issues/5065) +- NestedLoopJoin will panic when right child contains RepartitionExec [\#5022](https://github.com/apache/arrow-datafusion/issues/5022) +- JSON projection only work when the index is in ascending order [\#4832](https://github.com/apache/arrow-datafusion/issues/4832) +- Stack overflows when planning tpcds 22 in debug mode [\#4786](https://github.com/apache/arrow-datafusion/issues/4786) +- Failed to create Left anti join physical plan due to SchemaError::FieldNotFound [\#4366](https://github.com/apache/arrow-datafusion/issues/4366) +- Filters/limit are not pushdown druing optimalization for table with alias [\#2270](https://github.com/apache/arrow-datafusion/issues/2270) + +**Documentation updates:** + +- Update README.md fix \[welcoming community\] links [\#5232](https://github.com/apache/arrow-datafusion/pull/5232) ([jiangzhx](https://github.com/jiangzhx)) +- Update README.md update blaze-rs link to https://github.com/blaze-init/blaze [\#5190](https://github.com/apache/arrow-datafusion/pull/5190) ([jiangzhx](https://github.com/jiangzhx)) +- Typo of greptimedb [\#5103](https://github.com/apache/arrow-datafusion/pull/5103) ([fengjiachun](https://github.com/fengjiachun)) +- chore: change `DataBend` to `Databend` [\#5096](https://github.com/apache/arrow-datafusion/pull/5096) ([xudong963](https://github.com/xudong963)) + +**Closed issues:** + +- Implement column number / column type verification for sqllogictest [\#4499](https://github.com/apache/arrow-datafusion/issues/4499) + +**Merged pull requests:** + +- generate new projection plan in inline\_table\_scan instead of discarding [\#5371](https://github.com/apache/arrow-datafusion/pull/5371) ([jackwener](https://github.com/jackwener)) +- minor: fix rule name and comment. [\#5370](https://github.com/apache/arrow-datafusion/pull/5370) ([jackwener](https://github.com/jackwener)) +- minor: port limit tests to sqllogictests [\#5355](https://github.com/apache/arrow-datafusion/pull/5355) ([jackwener](https://github.com/jackwener)) +- feat: add rule to merge projection. [\#5349](https://github.com/apache/arrow-datafusion/pull/5349) ([jackwener](https://github.com/jackwener)) +- Ignore Arrow in dependabot [\#5341](https://github.com/apache/arrow-datafusion/pull/5341) ([iajoiner](https://github.com/iajoiner)) +- minor: remove useless `.get()` [\#5336](https://github.com/apache/arrow-datafusion/pull/5336) ([jackwener](https://github.com/jackwener)) +- bugfix: fix tpcds\_logical\_q8 ambiguous name. 
[\#5335](https://github.com/apache/arrow-datafusion/pull/5335) ([jackwener](https://github.com/jackwener)) +- minor: disable tpcds\_logical\_q10/q35 [\#5333](https://github.com/apache/arrow-datafusion/pull/5333) ([jackwener](https://github.com/jackwener)) +- minor: port intersection sql tests to sqllogictests [\#5331](https://github.com/apache/arrow-datafusion/pull/5331) ([alamb](https://github.com/alamb)) +- minor: port more window tests to sqllogictests [\#5330](https://github.com/apache/arrow-datafusion/pull/5330) ([alamb](https://github.com/alamb)) +- MINOR: nicer error messages for cli, use display format rather than debug [\#5329](https://github.com/apache/arrow-datafusion/pull/5329) ([kmitchener](https://github.com/kmitchener)) +- Add missing protobuf serialisation functionality GetIndexedFieldExpr. [\#5324](https://github.com/apache/arrow-datafusion/pull/5324) ([ahmedriza](https://github.com/ahmedriza)) +- chore: small typo in the example README [\#5319](https://github.com/apache/arrow-datafusion/pull/5319) ([gianarb](https://github.com/gianarb)) +- feat: add accessor to SessionContext fields for ContextProvider impl [\#5318](https://github.com/apache/arrow-datafusion/pull/5318) ([sunng87](https://github.com/sunng87)) +- \[DOC\] Update math expression documentation [\#5316](https://github.com/apache/arrow-datafusion/pull/5316) ([comphead](https://github.com/comphead)) +- Fix nested list indexing when the index is 0 or larger than the list size [\#5311](https://github.com/apache/arrow-datafusion/pull/5311) ([ahmedriza](https://github.com/ahmedriza)) +- Fix SortExec bench case and Add SortExec input cases to bench for SortPreservingMergeExec [\#5308](https://github.com/apache/arrow-datafusion/pull/5308) ([jaylmiller](https://github.com/jaylmiller)) +- Allow DISTINCT with ORDER BY and an aliased select list [\#5307](https://github.com/apache/arrow-datafusion/pull/5307) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- Serialize preserve\_partitioning in SortExec [\#5306](https://github.com/apache/arrow-datafusion/pull/5306) ([thinkharderdev](https://github.com/thinkharderdev)) +- fix: correct plan builder when test `scalar_subquery_project_expr` [\#5304](https://github.com/apache/arrow-datafusion/pull/5304) ([jackwener](https://github.com/jackwener)) +- Make SQL query consistent with API syntax expression in code examples [\#5303](https://github.com/apache/arrow-datafusion/pull/5303) ([ongchi](https://github.com/ongchi)) +- enable tpcds-64 test [\#5302](https://github.com/apache/arrow-datafusion/pull/5302) ([jackwener](https://github.com/jackwener)) +- Feature/merge batches removal [\#5300](https://github.com/apache/arrow-datafusion/pull/5300) ([berkaysynnada](https://github.com/berkaysynnada)) +- fix: add yield point to `RepartitionExec` [\#5299](https://github.com/apache/arrow-datafusion/pull/5299) ([crepererum](https://github.com/crepererum)) +- `datafusion.optimizer.repartition_file_scans` enabled by default [\#5295](https://github.com/apache/arrow-datafusion/pull/5295) ([korowa](https://github.com/korowa)) +- minor: derive Ord/PartialOrd/Eq/PartialEq traits for `ObjectStoreUrl` 
[\#5288](https://github.com/apache/arrow-datafusion/pull/5288) ([crepererum](https://github.com/crepererum)) +- Fix the potential bug of check\_all\_column\_from\_schema [\#5287](https://github.com/apache/arrow-datafusion/pull/5287) ([ygf11](https://github.com/ygf11)) +- Linear search support for Window Group queries [\#5286](https://github.com/apache/arrow-datafusion/pull/5286) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([mustafasrepo](https://github.com/mustafasrepo)) +- Prevent optimizer from pruning UnnestExec. [\#5282](https://github.com/apache/arrow-datafusion/pull/5282) ([vincev](https://github.com/vincev)) +- Minor: Add fetch to SortExec display [\#5279](https://github.com/apache/arrow-datafusion/pull/5279) ([thinkharderdev](https://github.com/thinkharderdev)) +- Set `catalog_list` from outside for `SessionState`. [\#5277](https://github.com/apache/arrow-datafusion/pull/5277) ([MichaelScofield](https://github.com/MichaelScofield)) +- Support page skipping / page\_index pushdown for evolved schemas [\#5268](https://github.com/apache/arrow-datafusion/pull/5268) ([alamb](https://github.com/alamb)) +- Use upstream newline\_delimited\_stream [\#5267](https://github.com/apache/arrow-datafusion/pull/5267) ([tustvold](https://github.com/tustvold)) +- Support non-tuple expression for exists-subquery to join [\#5264](https://github.com/apache/arrow-datafusion/pull/5264) ([ygf11](https://github.com/ygf11)) +- minor: Fix cargo fmt [\#5263](https://github.com/apache/arrow-datafusion/pull/5263) ([alamb](https://github.com/alamb)) +- minor: replace `unwrap()` with `?` [\#5262](https://github.com/apache/arrow-datafusion/pull/5262) ([jackwener](https://github.com/jackwener)) +- Preserve `TableScan.projection` order in `push_down_projection` optimizer rule [\#5261](https://github.com/apache/arrow-datafusion/pull/5261) ([korowa](https://github.com/korowa)) +- Minor: refactor ParquetExec roundtrip tests [\#5260](https://github.com/apache/arrow-datafusion/pull/5260) ([alamb](https://github.com/alamb)) +- \[fix\]\[plan\] relax the check for distinct, order by for dataframe [\#5258](https://github.com/apache/arrow-datafusion/pull/5258) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xiaoyong-z](https://github.com/xiaoyong-z)) +- enhance the checking of type errors in the test `window_frame_creation` [\#5257](https://github.com/apache/arrow-datafusion/pull/5257) ([HaoYang670](https://github.com/HaoYang670)) +- SQL planning benchmarks for very wide tables [\#5256](https://github.com/apache/arrow-datafusion/pull/5256) ([alamb](https://github.com/alamb)) +- Minor: Add negative test for SORT BY [\#5254](https://github.com/apache/arrow-datafusion/pull/5254) ([alamb](https://github.com/alamb)) +- \[sqllogictest\] Define output types and check them in tests [\#5253](https://github.com/apache/arrow-datafusion/pull/5253) ([melgenek](https://github.com/melgenek)) +- Minor: port some explain test to sqllogictest, add filename normalization [\#5252](https://github.com/apache/arrow-datafusion/pull/5252) ([alamb](https://github.com/alamb)) +- Disallow SORT BY in SQL [\#5249](https://github.com/apache/arrow-datafusion/pull/5249) 
[[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jefffrey](https://github.com/Jefffrey)) +- \[SQLLogicTest\] Make schema validation ignore nullable and metadata attributes [\#5246](https://github.com/apache/arrow-datafusion/pull/5246) ([comphead](https://github.com/comphead)) +- Add SQL function overload LOG\(base, x\) for logarithm of x to base [\#5245](https://github.com/apache/arrow-datafusion/pull/5245) ([comphead](https://github.com/comphead)) +- Update sqllogictest requirement from 0.11.1 to 0.12.0 \#5237 [\#5244](https://github.com/apache/arrow-datafusion/pull/5244) ([alamb](https://github.com/alamb)) +- Test case for NDJsonExec with randomly ordered projection [\#5243](https://github.com/apache/arrow-datafusion/pull/5243) ([korowa](https://github.com/korowa)) +- Update to arrow `33.0.0` [\#5241](https://github.com/apache/arrow-datafusion/pull/5241) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([tustvold](https://github.com/tustvold)) +- DataFusion 18.0.0 Release [\#5240](https://github.com/apache/arrow-datafusion/pull/5240) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([andygrove](https://github.com/andygrove)) +- fix clippy in nightly [\#5238](https://github.com/apache/arrow-datafusion/pull/5238) ([jackwener](https://github.com/jackwener)) +- refactor: correct the implementation of `all_schemas()` [\#5236](https://github.com/apache/arrow-datafusion/pull/5236) ([jackwener](https://github.com/jackwener)) +- bugfix: fix error when `get_coerced_window_frame` meet `utf8` [\#5234](https://github.com/apache/arrow-datafusion/pull/5234) ([jackwener](https://github.com/jackwener)) +- Feature/sort enforcement refactor [\#5228](https://github.com/apache/arrow-datafusion/pull/5228) ([mustafasrepo](https://github.com/mustafasrepo)) +- Minor: Fix doc links and typos [\#5225](https://github.com/apache/arrow-datafusion/pull/5225) ([Jefffrey](https://github.com/Jefffrey)) +- fix: correct expected error in test [\#5224](https://github.com/apache/arrow-datafusion/pull/5224) ([jackwener](https://github.com/jackwener)) +- bugfix: fix propagating empty\_relation generates an illegal plan [\#5219](https://github.com/apache/arrow-datafusion/pull/5219) ([yukkit](https://github.com/yukkit)) +- Replace placeholders in ScalarSubqueries [\#5216](https://github.com/apache/arrow-datafusion/pull/5216) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Dataframe join\_on method [\#5210](https://github.com/apache/arrow-datafusion/pull/5210) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jefffrey](https://github.com/Jefffrey)) +- bugfix: fix eval `nullalbe()` in `simplify_exprs` [\#5208](https://github.com/apache/arrow-datafusion/pull/5208) ([jackwener](https://github.com/jackwener)) +- minor: remove unnecessary clone [\#5207](https://github.com/apache/arrow-datafusion/pull/5207) ([Ted-Jiang](https://github.com/Ted-Jiang)) +- minor: extract `merge_schema()` function. 
[\#5203](https://github.com/apache/arrow-datafusion/pull/5203) ([jackwener](https://github.com/jackwener)) +- minor: remove unnecessary `continue` [\#5200](https://github.com/apache/arrow-datafusion/pull/5200) ([xiaoyong-z](https://github.com/xiaoyong-z)) +- Minor: Begin porting some window tests to sqllogictests [\#5199](https://github.com/apache/arrow-datafusion/pull/5199) ([alamb](https://github.com/alamb)) +- fix\(MemTable\): make it cancel-safe and fix parallelism [\#5197](https://github.com/apache/arrow-datafusion/pull/5197) ([DDtKey](https://github.com/DDtKey)) +- fix: make `write_csv/json/parquet` cancel-safe [\#5196](https://github.com/apache/arrow-datafusion/pull/5196) ([DDtKey](https://github.com/DDtKey)) +- Support arithmetic operation on DictionaryArray [\#5194](https://github.com/apache/arrow-datafusion/pull/5194) ([viirya](https://github.com/viirya)) +- sqllogicaltest: add cleanup and use rowsort. [\#5189](https://github.com/apache/arrow-datafusion/pull/5189) ([jackwener](https://github.com/jackwener)) +- bugfix: fix `TableScan` may contain fields not included in `schema` [\#5188](https://github.com/apache/arrow-datafusion/pull/5188) ([jackwener](https://github.com/jackwener)) +- Create disk manager spill folder if doesn't exist [\#5185](https://github.com/apache/arrow-datafusion/pull/5185) ([comphead](https://github.com/comphead)) +- Parse identifiers properly for TableReferences [\#5183](https://github.com/apache/arrow-datafusion/pull/5183) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jefffrey](https://github.com/Jefffrey)) +- Fix decimal scalar dyn kernels [\#5179](https://github.com/apache/arrow-datafusion/pull/5179) ([viirya](https://github.com/viirya)) +- Patch git Safe Paths in CI [\#5177](https://github.com/apache/arrow-datafusion/pull/5177) ([tustvold](https://github.com/tustvold)) +- Add initial support for serializing physical plans with Substrait [\#5176](https://github.com/apache/arrow-datafusion/pull/5176) ([andygrove](https://github.com/andygrove)) +- Bump tokio from 1.24.1 to 1.24.2 in /datafusion-cli [\#5172](https://github.com/apache/arrow-datafusion/pull/5172) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Make EnforceSorting global sort aware, fix sort mis-optimizations involving unions, support parallel sort + merge transformations [\#5171](https://github.com/apache/arrow-datafusion/pull/5171) ([mustafasrepo](https://github.com/mustafasrepo)) +- Update substrait README.md [\#5168](https://github.com/apache/arrow-datafusion/pull/5168) ([jiangzhx](https://github.com/jiangzhx)) +- Switch to use sum kernel from arrow-rs for Decimal128 [\#5167](https://github.com/apache/arrow-datafusion/pull/5167) ([sunchao](https://github.com/sunchao)) +- FileStream: Open next file in parallel while decoding [\#5161](https://github.com/apache/arrow-datafusion/pull/5161) ([thinkharderdev](https://github.com/thinkharderdev)) +- Fix FairSpillPool try\_grow for non-spillable consumers [\#5160](https://github.com/apache/arrow-datafusion/pull/5160) ([tustvold](https://github.com/tustvold)) +- fix: treat unsupported SQL plans as "not implemented" [\#5159](https://github.com/apache/arrow-datafusion/pull/5159) 
([crepererum](https://github.com/crepererum)) +- Compare NULL types [\#5158](https://github.com/apache/arrow-datafusion/pull/5158) ([melgenek](https://github.com/melgenek)) +- Always wrapping OnceAsync for the inner table side in NestedLoopJoinExec [\#5156](https://github.com/apache/arrow-datafusion/pull/5156) ([ygf11](https://github.com/ygf11)) +- chore: add object\_name\_to\_table\_reference in SqlToRel [\#5155](https://github.com/apache/arrow-datafusion/pull/5155) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([jiacai2050](https://github.com/jiacai2050)) +- Ambiguity check for where selection [\#5153](https://github.com/apache/arrow-datafusion/pull/5153) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jefffrey](https://github.com/Jefffrey)) +- feat: Type coercion for Dictionary\(\_, \_\) to Utf8 for regex conditions [\#5152](https://github.com/apache/arrow-datafusion/pull/5152) ([stuartcarnie](https://github.com/stuartcarnie)) +- Support arithmetic scalar operation with DictionaryArray [\#5151](https://github.com/apache/arrow-datafusion/pull/5151) ([viirya](https://github.com/viirya)) +- \[sqllogictest\] Support `pg_typeof` [\#5148](https://github.com/apache/arrow-datafusion/pull/5148) ([melgenek](https://github.com/melgenek)) +- Date to Timestamp cast [\#5140](https://github.com/apache/arrow-datafusion/pull/5140) ([comphead](https://github.com/comphead)) +- add example for Flight SQL server that supports JDBC driver [\#5138](https://github.com/apache/arrow-datafusion/pull/5138) ([kmitchener](https://github.com/kmitchener)) +- Add in-list test [\#5135](https://github.com/apache/arrow-datafusion/pull/5135) ([nseekhao](https://github.com/nseekhao)) +- \[BugFix\] abort plan if order by column not in select list [\#5132](https://github.com/apache/arrow-datafusion/pull/5132) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([xiaoyong-z](https://github.com/xiaoyong-z)) +- Bug fix: Empty Record Batch handling [\#5131](https://github.com/apache/arrow-datafusion/pull/5131) ([mustafasrepo](https://github.com/mustafasrepo)) +- Add option to control whether to normalize ident [\#5124](https://github.com/apache/arrow-datafusion/pull/5124) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([jiacai2050](https://github.com/jiacai2050)) +- Make `parse_physical_expr` public [\#5118](https://github.com/apache/arrow-datafusion/pull/5118) ([comphead](https://github.com/comphead)) +- Support coercing `utf8` to `interval` and `timestamp` \(including arguments to `date_bin`\) [\#5117](https://github.com/apache/arrow-datafusion/pull/5117) ([alamb](https://github.com/alamb)) +- Fix release issues [\#5116](https://github.com/apache/arrow-datafusion/pull/5116) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([andygrove](https://github.com/andygrove)) +- minor: port date\_bin tests to sqllogictests [\#5115](https://github.com/apache/arrow-datafusion/pull/5115) ([alamb](https://github.com/alamb)) +- Minor: reduce code duplication using `rewrite_expr` [\#5114](https://github.com/apache/arrow-datafusion/pull/5114) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] 
([alamb](https://github.com/alamb)) +- Replace &Option\ with Option\<&T\> [\#5113](https://github.com/apache/arrow-datafusion/pull/5113) ([gaoxinge](https://github.com/gaoxinge)) +- Improve `get_meet_of_orderings` to check for common prefixes [\#5111](https://github.com/apache/arrow-datafusion/pull/5111) ([ozankabak](https://github.com/ozankabak)) +- \[sqllogictest\] Apply rowsort when there is no explicit order by [\#5110](https://github.com/apache/arrow-datafusion/pull/5110) ([melgenek](https://github.com/melgenek)) +- Add unnest\_column to DataFrame [\#5106](https://github.com/apache/arrow-datafusion/pull/5106) ([vincev](https://github.com/vincev)) +- Minor: reduce indent level in page filter pruning code [\#5105](https://github.com/apache/arrow-datafusion/pull/5105) ([alamb](https://github.com/alamb)) +- Replace &Option\ with Option\<&T\> [\#5102](https://github.com/apache/arrow-datafusion/pull/5102) ([gaoxinge](https://github.com/gaoxinge)) +- Minor: remove unused methods in datafusion/optimizer/src/utils.rs [\#5098](https://github.com/apache/arrow-datafusion/pull/5098) ([ygf11](https://github.com/ygf11)) +- ci: don't trigger rust ci for doc changes [\#5097](https://github.com/apache/arrow-datafusion/pull/5097) ([xudong963](https://github.com/xudong963)) +- sqllogicaltest: fix unstable slt case. [\#5095](https://github.com/apache/arrow-datafusion/pull/5095) ([jackwener](https://github.com/jackwener)) +- chore: update cranelift-module [\#5094](https://github.com/apache/arrow-datafusion/pull/5094) ([jackwener](https://github.com/jackwener)) +- refactor: Add `rewrite_expr` convenience method for rewriting `Expr`s [\#5092](https://github.com/apache/arrow-datafusion/pull/5092) ([alamb](https://github.com/alamb)) +- Minor: extract sort col rewrite into its own module, add unit tests [\#5088](https://github.com/apache/arrow-datafusion/pull/5088) ([alamb](https://github.com/alamb)) +- \[sqllogictest\] Move `decimal.rs` tests [\#5086](https://github.com/apache/arrow-datafusion/pull/5086) ([melgenek](https://github.com/melgenek)) +- Insert target columns empty fix [\#5079](https://github.com/apache/arrow-datafusion/pull/5079) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([gruuya](https://github.com/gruuya)) +- sqllogicaltest: move union.rs [\#5075](https://github.com/apache/arrow-datafusion/pull/5075) ([jackwener](https://github.com/jackwener)) +- \[Enhancement\] Don't repartition ProjectionExec when it does not compute anything [\#5074](https://github.com/apache/arrow-datafusion/pull/5074) ([xiaoyong-z](https://github.com/xiaoyong-z)) +- Support ORDER BY an aliased column [\#5067](https://github.com/apache/arrow-datafusion/pull/5067) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([alamb](https://github.com/alamb)) +- Parquet parallel scan [\#5057](https://github.com/apache/arrow-datafusion/pull/5057) ([korowa](https://github.com/korowa)) +- \[BugFix\] fix file stream time scanning metrics bug [\#5020](https://github.com/apache/arrow-datafusion/pull/5020) ([xiaoyong-z](https://github.com/xiaoyong-z)) +- Show optimization errors in explain [\#4819](https://github.com/apache/arrow-datafusion/pull/4819) 
[[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Jefffrey](https://github.com/Jefffrey)) + + ## [18.0.0](https://github.com/apache/arrow-datafusion/tree/18.0.0) (2023-02-10) [Full Changelog](https://github.com/apache/arrow-datafusion/compare/17.0.0...18.0.0) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 0dcae674f5c9..96367f0c1959 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-common" description = "Common functionality for DataFusion query engine" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -40,11 +40,11 @@ pyarrow = ["pyo3", "arrow/pyarrow"] [dependencies] apache-avro = { version = "0.14", default-features = false, features = ["snappy"], optional = true } -arrow = { version = "32.0.0", default-features = false } +arrow = { version = "34.0.0", default-features = false } chrono = { version = "0.4", default-features = false } cranelift-module = { version = "0.92.0", optional = true } num_cpus = "1.13.0" object_store = { version = "0.5.4", default-features = false, optional = true } -parquet = { version = "32.0.0", default-features = false, optional = true } +parquet = { version = "34.0.0", default-features = false, optional = true } pyo3 = { version = "0.18.0", optional = true } sqlparser = "0.30" diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 6033d4a0ccf0..03ec745416f4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -284,7 +284,7 @@ config_namespace! { /// Currently supported only for Parquet format in which case /// multiple row groups from the same file may be read concurrently. If false then each /// row group is read serially, though different files may be read in parallel. - pub repartition_file_scans: bool, default = false + pub repartition_file_scans: bool, default = true /// Should DataFusion repartition data using the partitions keys to execute window /// functions in parallel using the provided `target_partitions` level diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 023b0873d344..61b343b63f17 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -286,14 +286,31 @@ impl DFSchema { match matches.len() { 0 => Err(field_not_found::<&str>(None, name, self)), 1 => Ok(matches[0]), - _ => Err(DataFusionError::SchemaError( - SchemaError::AmbiguousReference { - field: Column { - relation: None, - name: name.to_string(), - }, - }, - )), + _ => { + // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. + // Because name may generate from Alias/... . It means that it don't own qualifier. + // For example: + // Join on id = b.id + // Project a.id as id TableScan b id + // In this case, there isn't `ambiguous name` problem. When `matches` just contains + // one field without qualifier, we should return it. 
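// (Editorial illustration, not part of the patch.) Spelled out, the scenario from
// the comment above looks roughly like this, assuming hypothetical tables `a` and `b`:
//
//     Inner Join: id = b.id
//       Projection: a.id AS id    <- produces an unqualified field named "id"
//       TableScan: b              <- produces the qualified field "b.id"
//
// An unqualified lookup of "id" matches both fields, but only one of the matches
// has no qualifier, so that single unqualified field is returned instead of
// raising an ambiguous-reference error.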
+ let fields_without_qualifier = matches + .iter() + .filter(|f| f.qualifier.is_none()) + .collect::>(); + if fields_without_qualifier.len() == 1 { + Ok(fields_without_qualifier[0]) + } else { + Err(DataFusionError::SchemaError( + SchemaError::AmbiguousReference { + field: Column { + relation: None, + name: name.to_string(), + }, + }, + )) + } + } } } diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs index 1c31d61d143c..4aff7c7eb477 100644 --- a/datafusion/common/src/parsers.rs +++ b/datafusion/common/src/parsers.rs @@ -34,6 +34,8 @@ pub enum CompressionTypeVariant { BZIP2, /// Xz-ed file (liblzma) XZ, + /// Zstd-ed file, + ZSTD, /// Uncompressed file UNCOMPRESSED, } @@ -47,6 +49,7 @@ impl FromStr for CompressionTypeVariant { "GZIP" | "GZ" => Ok(Self::GZIP), "BZIP2" | "BZ2" => Ok(Self::BZIP2), "XZ" => Ok(Self::XZ), + "ZST" | "ZSTD" => Ok(Self::ZSTD), "" => Ok(Self::UNCOMPRESSED), _ => Err(ParserError::ParserError(format!( "Unsupported file compression type {s}" @@ -61,6 +64,7 @@ impl ToString for CompressionTypeVariant { Self::GZIP => "GZIP", Self::BZIP2 => "BZIP2", Self::XZ => "XZ", + Self::ZSTD => "ZSTD", Self::UNCOMPRESSED => "", } .to_string() diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 15702c29c525..ecc7945f3829 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion" description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "../../README.md" @@ -40,7 +40,7 @@ path = "src/lib.rs" [features] # Used to enable the avro format avro = ["apache-avro", "num-traits", "datafusion-common/avro"] -compression = ["xz2", "bzip2", "flate2", "async-compression"] +compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "compression"] # Enables support for non-scalar, binary operations on dictionaries @@ -60,20 +60,20 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.14", optional = true } -arrow = { version = "32.0.0", features = ["prettyprint"] } -async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "xz", "futures-io", "tokio"], optional = true } +arrow = { version = "34.0.0", features = ["prettyprint"] } +async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "xz", "zstd", "futures-io", "tokio"], optional = true } async-trait = "0.1.41" bytes = "1.1" bzip2 = { version = "0.4.3", optional = true } chrono = { version = "0.4.23", default-features = false } dashmap = "5.4.0" -datafusion-common = { path = "../common", version = "18.0.0", features = ["parquet", "object_store"] } -datafusion-expr = { path = "../expr", version = "18.0.0" } -datafusion-jit = { path = "../jit", version = "18.0.0", optional = true } -datafusion-optimizer = { path = "../optimizer", version = "18.0.0" } -datafusion-physical-expr = { path = "../physical-expr", version = "18.0.0" } -datafusion-row = { path = "../row", version = "18.0.0" } -datafusion-sql = { path = "../sql", version = "18.0.0" } +datafusion-common = { path = "../common", 
version = "19.0.0", features = ["parquet", "object_store"] } +datafusion-expr = { path = "../expr", version = "19.0.0" } +datafusion-jit = { path = "../jit", version = "19.0.0", optional = true } +datafusion-optimizer = { path = "../optimizer", version = "19.0.0" } +datafusion-physical-expr = { path = "../physical-expr", version = "19.0.0" } +datafusion-row = { path = "../row", version = "19.0.0" } +datafusion-sql = { path = "../sql", version = "19.0.0" } flate2 = { version = "1.0.24", optional = true } futures = "0.3" glob = "0.3.0" @@ -84,9 +84,9 @@ lazy_static = { version = "^1.4.0" } log = "^0.4" num-traits = { version = "0.2", optional = true } num_cpus = "1.13.0" -object_store = "0.5.3" +object_store = "0.5.4" parking_lot = "0.12" -parquet = { version = "32.0.0", features = ["arrow", "async"] } +parquet = { version = "34.0.0", features = ["arrow", "async"] } paste = "^1.0" percent-encoding = "2.2.0" pin-project-lite = "^0.2.7" @@ -101,6 +101,7 @@ tokio-util = { version = "0.7.4", features = ["io"] } url = "2.2" uuid = { version = "1.0", features = ["v4"] } xz2 = { version = "0.1", optional = true } +zstd = { version = "0.11", optional = true, default-features = false } [dev-dependencies] diff --git a/datafusion/core/benches/merge.rs b/datafusion/core/benches/merge.rs index a7ac6cd41dac..f1c4736039f9 100644 --- a/datafusion/core/benches/merge.rs +++ b/datafusion/core/benches/merge.rs @@ -80,6 +80,7 @@ use arrow::{ /// Benchmarks for SortPreservingMerge stream use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::{ execution::context::TaskContext, physical_plan::{ @@ -136,11 +137,22 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(move || case.run()) }); + c.bench_function("merge i64 SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&I64_STREAMS); + + b.iter(move || case.run()) + }); + c.bench_function("merge f64", |b| { let case = MergeBenchCase::new(&F64_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge f64 SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&F64_STREAMS); + + b.iter(move || case.run()) + }); c.bench_function("merge utf8 low cardinality", |b| { let case = MergeBenchCase::new(&UTF8_LOW_CARDINALITY_STREAMS); @@ -148,39 +160,79 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(move || case.run()) }); + c.bench_function("merge utf8 low cardinality SortExec", |b| { + let case = MergeBenchCase::new_with_sort_input(&UTF8_LOW_CARDINALITY_STREAMS); + + b.iter(move || case.run()) + }); + c.bench_function("merge utf8 high cardinality", |b| { let case = MergeBenchCase::new(&UTF8_HIGH_CARDINALITY_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge utf8 high cardinality SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&UTF8_HIGH_CARDINALITY_STREAMS); + + b.iter(move || case.run()) + }); + c.bench_function("merge utf8 tuple", |b| { let case = MergeBenchCase::new(&UTF8_TUPLE_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge utf8 tuple SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&UTF8_TUPLE_STREAMS); + + b.iter(move || case.run()) + }); + c.bench_function("merge utf8 dictionary", |b| { let case = MergeBenchCase::new(&DICTIONARY_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge utf8 dictionary SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&DICTIONARY_STREAMS); + + b.iter(move || case.run()) + }); + 
c.bench_function("merge utf8 dictionary tuple", |b| { let case = MergeBenchCase::new(&DICTIONARY_TUPLE_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge utf8 dictionary tuple SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&DICTIONARY_TUPLE_STREAMS); + b.iter(move || case.run()) + }); + c.bench_function("merge mixed utf8 dictionary tuple", |b| { let case = MergeBenchCase::new(&MIXED_DICTIONARY_TUPLE_STREAMS); b.iter(move || case.run()) }); + c.bench_function("merge mixed utf8 dictionary tuple SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&MIXED_DICTIONARY_TUPLE_STREAMS); + b.iter(move || case.run()) + }); + c.bench_function("merge mixed tuple", |b| { let case = MergeBenchCase::new(&MIXED_TUPLE_STREAMS); b.iter(move || case.run()) }); + + c.bench_function("merge mixed tuple SortExec input", |b| { + let case = MergeBenchCase::new_with_sort_input(&MIXED_TUPLE_STREAMS); + + b.iter(move || case.run()) + }); } /// Encapsulates running each test case @@ -214,6 +266,26 @@ impl MergeBenchCase { } } + fn new_with_sort_input(partitions: &[Vec]) -> Self { + let runtime = tokio::runtime::Builder::new_multi_thread().build().unwrap(); + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + + let schema = partitions[0][0].schema(); + let sort = make_sort_exprs(schema.as_ref()); + + let projection = None; + let exec = Arc::new(MemoryExec::try_new(partitions, schema, projection).unwrap()); + let sort_exec = SortExec::try_new(sort.to_owned(), exec, None).unwrap(); + let plan = Arc::new(SortPreservingMergeExec::new(sort, Arc::new(sort_exec))); + + Self { + runtime, + task_ctx, + plan, + } + } + /// runs the specified plan to completion, draining all input and /// panic'ing on error fn run(&self) { diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs index 2d9417d8bd3b..0507a9308a28 100644 --- a/datafusion/core/benches/sort.rs +++ b/datafusion/core/benches/sort.rs @@ -29,6 +29,7 @@ use arrow::{ /// Benchmarks for SortExec use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::{ execution::context::TaskContext, physical_plan::{memory::MemoryExec, sorts::sort::SortExec, ExecutionPlan}, @@ -104,7 +105,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(move || case.run()) }); c.bench_function("sort utf8 low cardinality preserve partitioning", |b| { - let case = SortBenchCase::new(&UTF8_LOW_CARDINALITY_STREAMS); + let case = SortBenchCasePreservePartitioning::new(&UTF8_LOW_CARDINALITY_STREAMS); b.iter(move || case.run()) }); @@ -199,7 +200,8 @@ impl SortBenchCase { let projection = None; let exec = MemoryExec::try_new(partitions, schema, projection).unwrap(); - let plan = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); + let exec = Arc::new(CoalescePartitionsExec::new(Arc::new(exec))); + let plan = Arc::new(SortExec::try_new(sort, exec, None).unwrap()); Self { runtime, diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index 36135bd1eb36..1d5396219584 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -1097,15 +1097,22 @@ mod tests { .unwrap() .distinct() .unwrap() - .sort(vec![col("c2").sort(true, true)]) + .sort(vec![col("c1").sort(true, true)]) .unwrap(); + let df_results = plan.clone().collect().await?; + + #[rustfmt::skip] assert_batches_sorted_eq!( vec![ - "+----+", "| c1 |", "+----+", "| a |", "| a |", 
"| a |", "| a |", - "| a |", "| b |", "| b |", "| b |", "| b |", "| b |", "| c |", - "| c |", "| c |", "| c |", "| c |", "| d |", "| d |", "| d |", - "| d |", "| d |", "| e |", "| e |", "| e |", "| e |", "| e |", + "+----+", + "| c1 |", + "+----+", + "| a |", + "| b |", + "| c |", + "| d |", + "| e |", "+----+", ], &df_results @@ -1114,6 +1121,22 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_distinct_sort_by_unprojected() -> Result<()> { + let t = test_table().await?; + let err = t + .select(vec![col("c1")]) + .unwrap() + .distinct() + .unwrap() + // try to sort on some value not present in input to distinct + .sort(vec![col("c2").sort(true, true)]) + .unwrap_err(); + assert_eq!(err.to_string(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); + + Ok(()) + } + #[tokio::test] async fn join() -> Result<()> { let left = test_table().await?.select_columns(&["c1", "c2"])?; @@ -1552,11 +1575,9 @@ mod tests { \n Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1\ \n Inner Join: t1.c1 = t2.c1\ \n SubqueryAlias: t1\ - \n Projection: aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c3\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]\ + \n TableScan: aggregate_test_100 projection=[c1, c2, c3]\ \n SubqueryAlias: t2\ - \n Projection: aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c3\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]", + \n TableScan: aggregate_test_100 projection=[c1, c2, c3]", format!("{:?}", df_renamed.clone().into_optimized_plan()?) ); diff --git a/datafusion/core/src/datasource/datasource.rs b/datafusion/core/src/datasource/datasource.rs index 8b0c823acf65..6277ce146adf 100644 --- a/datafusion/core/src/datasource/datasource.rs +++ b/datafusion/core/src/datasource/datasource.rs @@ -72,6 +72,7 @@ pub trait TableProvider: Sync + Send { /// Tests whether the table provider can make use of a filter expression /// to optimise data retrieval. + #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown instead")] fn supports_filter_pushdown( &self, _filter: &Expr, @@ -79,6 +80,19 @@ pub trait TableProvider: Sync + Send { Ok(TableProviderFilterPushDown::Unsupported) } + /// Tests whether the table provider can make use of any or all filter expressions + /// to optimise data retrieval. + #[allow(deprecated)] + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + filters + .iter() + .map(|f| self.supports_filter_pushdown(f)) + .collect() + } + /// Get statistics for this table, if available fn statistics(&self) -> Option { None diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index bbb9fbdd6492..c6fd87e7f18b 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -52,13 +52,13 @@ impl TableSource for DefaultTableSource { self.table_provider.schema() } - /// Tests whether the table provider can make use of a filter expression + /// Tests whether the table provider can make use of any or all filter expressions /// to optimise data retrieval. 
- fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - filter: &Expr, - ) -> datafusion_common::Result { - self.table_provider.supports_filter_pushdown(filter) + filter: &[&Expr], + ) -> datafusion_common::Result> { + self.table_provider.supports_filters_pushdown(filter) } fn get_logical_plan(&self) -> Option<&datafusion_expr::LogicalPlan> { diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs index 75649c2a309d..1b6d2b3bc6f2 100644 --- a/datafusion/core/src/datasource/file_format/avro.rs +++ b/datafusion/core/src/datasource/file_format/avro.rs @@ -179,13 +179,13 @@ mod tests { "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| 4 | true | 0 | 0 | 0 | 0 | 0 | 0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", + "| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", "| 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 |", - "| 6 | true | 0 | 0 | 0 | 0 | 0 | 0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 |", + "| 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 |", "| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 |", - "| 2 | true | 0 | 0 | 0 | 0 | 0 | 0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 |", + "| 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 |", "| 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 |", - "| 0 | true | 0 | 0 | 0 | 0 | 0 | 0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 |", + "| 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 |", "| 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 |", "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", ]; diff --git a/datafusion/core/src/datasource/file_format/file_type.rs b/datafusion/core/src/datasource/file_format/file_type.rs index 1a9973c68a64..59c95962a992 100644 --- a/datafusion/core/src/datasource/file_format/file_type.rs +++ b/datafusion/core/src/datasource/file_format/file_type.rs @@ -26,7 +26,7 @@ use crate::datasource::file_format::parquet::DEFAULT_PARQUET_EXTENSION; #[cfg(feature = "compression")] use async_compression::tokio::bufread::{ BzDecoder as AsyncBzDecoder, GzipDecoder as AsyncGzDecoder, - XzDecoder as AsyncXzDecoder, + XzDecoder as AsyncXzDecoder, ZstdDecoder as AsyncZstdDecoer, }; use bytes::Bytes; #[cfg(feature = "compression")] @@ -42,6 +42,8 @@ use std::str::FromStr; use tokio_util::io::{ReaderStream, StreamReader}; #[cfg(feature = "compression")] use xz2::read::XzDecoder; +#[cfg(feature = "compression")] +use zstd::Decoder as ZstdDecoder; use CompressionTypeVariant::*; /// Define each `FileType`/`FileCompressionType`'s extension @@ -62,6 +64,7 @@ impl GetExt for FileCompressionType { GZIP => ".gz".to_owned(), BZIP2 => ".bz2".to_owned(), XZ => ".xz".to_owned(), + ZSTD => ".zst".to_owned(), UNCOMPRESSED => "".to_owned(), } } @@ -95,6 +98,9 @@ impl 
FileCompressionType { /// Xz-ed file (liblzma) pub const XZ: Self = Self { variant: XZ }; + /// Zstd-ed file + pub const ZSTD: Self = Self { variant: ZSTD }; + /// Uncompressed file pub const UNCOMPRESSED: Self = Self { variant: UNCOMPRESSED, @@ -140,8 +146,13 @@ impl FileCompressionType { ReaderStream::new(AsyncXzDecoder::new(StreamReader::new(s))) .map_err(err_converter), ), + #[cfg(feature = "compression")] + ZSTD => Box::new( + ReaderStream::new(AsyncZstdDecoer::new(StreamReader::new(s))) + .map_err(err_converter), + ), #[cfg(not(feature = "compression"))] - GZIP | BZIP2 | XZ => { + GZIP | BZIP2 | XZ | ZSTD => { return Err(DataFusionError::NotImplemented( "Compression feature is not enabled".to_owned(), )) @@ -162,8 +173,13 @@ impl FileCompressionType { BZIP2 => Box::new(BzDecoder::new(r)), #[cfg(feature = "compression")] XZ => Box::new(XzDecoder::new(r)), + #[cfg(feature = "compression")] + ZSTD => match ZstdDecoder::new(r) { + Ok(decoder) => Box::new(decoder), + Err(e) => return Err(DataFusionError::External(Box::new(e))), + }, #[cfg(not(feature = "compression"))] - GZIP | BZIP2 | XZ => { + GZIP | BZIP2 | XZ | ZSTD => { return Err(DataFusionError::NotImplemented( "Compression feature is not enabled".to_owned(), )) @@ -239,155 +255,90 @@ mod tests { #[test] fn get_ext_with_compression() { - let file_type = FileType::CSV; - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) - .unwrap(), - ".csv" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::GZIP) - .unwrap(), - ".csv.gz" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::XZ) - .unwrap(), - ".csv.xz" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::BZIP2) - .unwrap(), - ".csv.bz2" - ); - - let file_type = FileType::JSON; - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) - .unwrap(), - ".json" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::GZIP) - .unwrap(), - ".json.gz" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::XZ) - .unwrap(), - ".json.xz" - ); - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::BZIP2) - .unwrap(), - ".json.bz2" - ); - - let file_type = FileType::AVRO; - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) - .unwrap(), - ".avro" - ); - assert!(matches!( - file_type.get_ext_with_compression(FileCompressionType::GZIP), - Err(DataFusionError::Internal(_)) - )); - assert!(matches!( - file_type.get_ext_with_compression(FileCompressionType::BZIP2), - Err(DataFusionError::Internal(_)) - )); + for (file_type, compression, extension) in [ + (FileType::CSV, FileCompressionType::UNCOMPRESSED, ".csv"), + (FileType::CSV, FileCompressionType::GZIP, ".csv.gz"), + (FileType::CSV, FileCompressionType::XZ, ".csv.xz"), + (FileType::CSV, FileCompressionType::BZIP2, ".csv.bz2"), + (FileType::CSV, FileCompressionType::ZSTD, ".csv.zst"), + (FileType::JSON, FileCompressionType::UNCOMPRESSED, ".json"), + (FileType::JSON, FileCompressionType::GZIP, ".json.gz"), + (FileType::JSON, FileCompressionType::XZ, ".json.xz"), + (FileType::JSON, FileCompressionType::BZIP2, ".json.bz2"), + (FileType::JSON, FileCompressionType::ZSTD, ".json.zst"), + ] { + assert_eq!( + file_type.get_ext_with_compression(compression).unwrap(), + extension + ); + } - let file_type = FileType::PARQUET; - assert_eq!( - file_type - 
.get_ext_with_compression(FileCompressionType::UNCOMPRESSED) - .unwrap(), - ".parquet" - ); - assert!(matches!( - file_type.get_ext_with_compression(FileCompressionType::GZIP), - Err(DataFusionError::Internal(_)) - )); - assert!(matches!( - file_type.get_ext_with_compression(FileCompressionType::BZIP2), - Err(DataFusionError::Internal(_)) - )); + // Cannot specify compression for these file types + for (file_type, extension) in + [(FileType::AVRO, ".avro"), (FileType::PARQUET, ".parquet")] + { + assert_eq!( + file_type + .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) + .unwrap(), + extension + ); + for compression in [ + FileCompressionType::GZIP, + FileCompressionType::XZ, + FileCompressionType::BZIP2, + FileCompressionType::ZSTD, + ] { + assert!(matches!( + file_type.get_ext_with_compression(compression), + Err(DataFusionError::Internal(_)) + )); + } + } } #[test] fn from_str() { - assert_eq!(FileType::from_str("csv").unwrap(), FileType::CSV); - assert_eq!(FileType::from_str("CSV").unwrap(), FileType::CSV); - - assert_eq!(FileType::from_str("json").unwrap(), FileType::JSON); - assert_eq!(FileType::from_str("JSON").unwrap(), FileType::JSON); - - assert_eq!(FileType::from_str("avro").unwrap(), FileType::AVRO); - assert_eq!(FileType::from_str("AVRO").unwrap(), FileType::AVRO); - - assert_eq!(FileType::from_str("parquet").unwrap(), FileType::PARQUET); - assert_eq!(FileType::from_str("PARQUET").unwrap(), FileType::PARQUET); + for (ext, file_type) in [ + ("csv", FileType::CSV), + ("CSV", FileType::CSV), + ("json", FileType::JSON), + ("JSON", FileType::JSON), + ("avro", FileType::AVRO), + ("AVRO", FileType::AVRO), + ("parquet", FileType::PARQUET), + ("PARQUET", FileType::PARQUET), + ] { + assert_eq!(FileType::from_str(ext).unwrap(), file_type); + } assert!(matches!( FileType::from_str("Unknown"), Err(DataFusionError::NotImplemented(_)) )); - assert_eq!( - FileCompressionType::from_str("gz").unwrap(), - FileCompressionType::GZIP - ); - assert_eq!( - FileCompressionType::from_str("GZ").unwrap(), - FileCompressionType::GZIP - ); - assert_eq!( - FileCompressionType::from_str("gzip").unwrap(), - FileCompressionType::GZIP - ); - assert_eq!( - FileCompressionType::from_str("GZIP").unwrap(), - FileCompressionType::GZIP - ); - assert_eq!( - FileCompressionType::from_str("xz").unwrap(), - FileCompressionType::XZ - ); - assert_eq!( - FileCompressionType::from_str("XZ").unwrap(), - FileCompressionType::XZ - ); - assert_eq!( - FileCompressionType::from_str("bz2").unwrap(), - FileCompressionType::BZIP2 - ); - assert_eq!( - FileCompressionType::from_str("BZ2").unwrap(), - FileCompressionType::BZIP2 - ); - assert_eq!( - FileCompressionType::from_str("bzip2").unwrap(), - FileCompressionType::BZIP2 - ); - assert_eq!( - FileCompressionType::from_str("BZIP2").unwrap(), - FileCompressionType::BZIP2 - ); - - assert_eq!( - FileCompressionType::from_str("").unwrap(), - FileCompressionType::UNCOMPRESSED - ); + for (ext, compression_type) in [ + ("gz", FileCompressionType::GZIP), + ("GZ", FileCompressionType::GZIP), + ("gzip", FileCompressionType::GZIP), + ("GZIP", FileCompressionType::GZIP), + ("xz", FileCompressionType::XZ), + ("XZ", FileCompressionType::XZ), + ("bz2", FileCompressionType::BZIP2), + ("BZ2", FileCompressionType::BZIP2), + ("bzip2", FileCompressionType::BZIP2), + ("BZIP2", FileCompressionType::BZIP2), + ("zst", FileCompressionType::ZSTD), + ("ZST", FileCompressionType::ZSTD), + ("zstd", FileCompressionType::ZSTD), + ("ZSTD", FileCompressionType::ZSTD), + ("", 
FileCompressionType::UNCOMPRESSED), + ] { + assert_eq!( + FileCompressionType::from_str(ext).unwrap(), + compression_type + ); + } assert!(matches!( FileCompressionType::from_str("Unknown"), diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e753364a556e..947327630d9c 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -24,6 +24,7 @@ pub mod avro; pub mod csv; pub mod file_type; pub mod json; +pub mod options; pub mod parquet; use std::any::Any; diff --git a/datafusion/core/src/execution/options.rs b/datafusion/core/src/datasource/file_format/options.rs similarity index 99% rename from datafusion/core/src/execution/options.rs rename to datafusion/core/src/datasource/file_format/options.rs index c7a5dea7e272..e51edf829e85 100644 --- a/datafusion/core/src/execution/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -23,7 +23,6 @@ use arrow::datatypes::{DataType, Schema, SchemaRef}; use async_trait::async_trait; use datafusion_common::DataFusionError; -use super::context::{SessionConfig, SessionState}; use crate::datasource::file_format::avro::DEFAULT_AVRO_EXTENSION; use crate::datasource::file_format::csv::DEFAULT_CSV_EXTENSION; use crate::datasource::file_format::file_type::FileCompressionType; @@ -38,6 +37,7 @@ use crate::datasource::{ listing::ListingOptions, }; use crate::error::Result; +use crate::execution::context::{SessionConfig, SessionState}; /// Options that control the reading of CSV files. /// diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 524ad9f5c2ad..dea7ecadf7f4 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -500,8 +500,7 @@ mod tests { let expected = "\ Explain\ \n CreateView: Bare { table: \"xyz\" }\ - \n Projection: abc.column1, abc.column2, abc.column3\ - \n TableScan: abc projection=[column1, column2, column3]"; + \n TableScan: abc projection=[column1, column2, column3]"; assert_eq!(expected, actual); let dataframe = session_ctx @@ -512,9 +511,8 @@ mod tests { let expected = "\ Explain\ \n CreateView: Bare { table: \"xyz\" }\ - \n Projection: abc.column1, abc.column2, abc.column3\ - \n Filter: abc.column2 = Int64(5)\ - \n TableScan: abc projection=[column1, column2, column3]"; + \n Filter: abc.column2 = Int64(5)\ + \n TableScan: abc projection=[column1, column2, column3]"; assert_eq!(expected, actual); let dataframe = session_ctx @@ -525,9 +523,8 @@ mod tests { let expected = "\ Explain\ \n CreateView: Bare { table: \"xyz\" }\ - \n Projection: abc.column1, abc.column2\ - \n Filter: abc.column2 = Int64(5)\ - \n TableScan: abc projection=[column1, column2]"; + \n Filter: abc.column2 = Int64(5)\ + \n TableScan: abc projection=[column1, column2]"; assert_eq!(expected, actual); Ok(()) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 8944b4f90503..c7f56733a501 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -31,6 +31,7 @@ use datafusion_expr::{DescribeTable, StringifiedPlan}; pub use datafusion_physical_expr::execution_props::ExecutionProps; use datafusion_physical_expr::var_provider::is_system_variables; use parking_lot::RwLock; +use std::collections::hash_map::Entry; use std::sync::Arc; use std::{ any::{Any, TypeId}, @@ -82,7 +83,7 @@ use crate::physical_plan::PhysicalPlanner; use 
crate::variable::{VarProvider, VarType}; use async_trait::async_trait; use chrono::{DateTime, Utc}; -use datafusion_common::ScalarValue; +use datafusion_common::{OwnedTableReference, ScalarValue}; use datafusion_sql::{ parser::DFParser, planner::{ContextProvider, SqlToRel}, @@ -106,6 +107,43 @@ use super::options::{ AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions, ReadOptions, }; +/// DataFilePaths adds a method to convert strings and vector of strings to vector of [`ListingTableUrl`] URLs. +/// This allows methods such [`SessionContext::read_csv`] and `[`SessionContext::read_avro`] +/// to take either a single file or multiple files. +pub trait DataFilePaths { + /// Parse to a vector of [`ListingTableUrl`] URLs. + fn to_urls(self) -> Result>; +} + +impl DataFilePaths for &str { + fn to_urls(self) -> Result> { + Ok(vec![ListingTableUrl::parse(self)?]) + } +} + +impl DataFilePaths for String { + fn to_urls(self) -> Result> { + Ok(vec![ListingTableUrl::parse(self)?]) + } +} + +impl DataFilePaths for &String { + fn to_urls(self) -> Result> { + Ok(vec![ListingTableUrl::parse(self)?]) + } +} + +impl

<P> DataFilePaths for Vec<P>
+where + P: AsRef, +{ + fn to_urls(self) -> Result> { + self.iter() + .map(ListingTableUrl::parse) + .collect::>>() + } +} + /// SessionContext is the main interface for executing queries with DataFusion. It stands for /// the connection between user and DataFusion/Ballista cluster. /// The context provides the following functionality @@ -627,22 +665,18 @@ impl SessionContext { /// /// For more control such as reading multiple files, you can use /// [`read_table`](Self::read_table) with a [`ListingTable`]. - async fn _read_type<'a>( + async fn _read_type<'a, P: DataFilePaths>( &self, - table_path: impl AsRef, + table_paths: P, options: impl ReadOptions<'a>, ) -> Result { - let table_path = ListingTableUrl::parse(table_path)?; + let table_paths = table_paths.to_urls()?; let session_config = self.copied_config(); let listing_options = options.to_listing_options(&session_config); - let resolved_schema = match options - .get_resolved_schema(&session_config, self.state(), table_path.clone()) - .await - { - Ok(resolved_schema) => resolved_schema, - Err(e) => return Err(e), - }; - let config = ListingTableConfig::new(table_path) + let resolved_schema = options + .get_resolved_schema(&session_config, self.state(), table_paths[0].clone()) + .await?; + let config = ListingTableConfig::new_with_multi_paths(table_paths) .with_listing_options(listing_options) .with_schema(resolved_schema); let provider = ListingTable::try_new(config)?; @@ -653,24 +687,28 @@ impl SessionContext { /// /// For more control such as reading multiple files, you can use /// [`read_table`](Self::read_table) with a [`ListingTable`]. - pub async fn read_avro( + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_avro( &self, - table_path: impl AsRef, + table_paths: P, options: AvroReadOptions<'_>, ) -> Result { - self._read_type(table_path, options).await + self._read_type(table_paths, options).await } /// Creates a [`DataFrame`] for reading an JSON data source. /// /// For more control such as reading multiple files, you can use /// [`read_table`](Self::read_table) with a [`ListingTable`]. - pub async fn read_json( + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_json( &self, - table_path: impl AsRef, + table_paths: P, options: NdJsonReadOptions<'_>, ) -> Result { - self._read_type(table_path, options).await + self._read_type(table_paths, options).await } /// Creates an empty DataFrame. @@ -685,24 +723,42 @@ impl SessionContext { /// /// For more control such as reading multiple files, you can use /// [`read_table`](Self::read_table) with a [`ListingTable`]. - pub async fn read_csv( + /// + /// Example usage is given below: + /// + /// ``` + /// use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let ctx = SessionContext::new(); + /// // You can read a single file using `read_csv` + /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// // you can also read multiple files: + /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn read_csv( &self, - table_path: impl AsRef, + table_paths: P, options: CsvReadOptions<'_>, ) -> Result { - self._read_type(table_path, options).await + self._read_type(table_paths, options).await } /// Creates a [`DataFrame`] for reading a Parquet data source. 
/// /// For more control such as reading multiple files, you can use /// [`read_table`](Self::read_table) with a [`ListingTable`]. - pub async fn read_parquet( + /// + /// For an example, see [`read_csv`](Self::read_csv) + pub async fn read_parquet( &self, - table_path: impl AsRef, + table_paths: P, options: ParquetReadOptions<'_>, ) -> Result { - self._read_type(table_path, options).await + self._read_type(table_paths, options).await } /// Creates a [`DataFrame`] for a [`TableProvider`] such as a @@ -1719,21 +1775,20 @@ impl SessionState { Ok(statement) } - /// Convert an AST Statement into a LogicalPlan - pub async fn statement_to_plan( + /// Resolve all table references in the SQL statement. + pub fn resolve_table_references( &self, - statement: datafusion_sql::parser::Statement, - ) -> Result { + statement: &datafusion_sql::parser::Statement, + ) -> Result> { use crate::catalog::information_schema::INFORMATION_SCHEMA_TABLES; use datafusion_sql::parser::Statement as DFStatement; use sqlparser::ast::*; - use std::collections::hash_map::Entry; // Getting `TableProviders` is async but planing is not -- thus pre-fetch // table providers for all relations referenced in this query let mut relations = hashbrown::HashSet::with_capacity(10); - match &statement { + match statement { DFStatement::Statement(s) => { struct RelationVisitor<'a>(&'a mut hashbrown::HashSet); @@ -1784,18 +1839,31 @@ impl SessionState { } } + let enable_ident_normalization = + self.config.options.sql_parser.enable_ident_normalization; + relations + .into_iter() + .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) + .collect::>() + } + + /// Convert an AST Statement into a LogicalPlan + pub async fn statement_to_plan( + &self, + statement: datafusion_sql::parser::Statement, + ) -> Result { + let references = self.resolve_table_references(&statement)?; + let mut provider = SessionContextProvider { state: self, - tables: HashMap::with_capacity(relations.len()), + tables: HashMap::with_capacity(references.len()), }; let enable_ident_normalization = self.config.options.sql_parser.enable_ident_normalization; let parse_float_as_decimal = self.config.options.sql_parser.parse_float_as_decimal; - for relation in relations { - let reference = - object_name_to_table_reference(relation, enable_ident_normalization)?; + for reference in references { let table = reference.table(); let resolved = self.resolve_table_ref(reference.as_table_reference()); if let Entry::Vacant(v) = provider.tables.entry(resolved.to_string()) { @@ -2319,7 +2387,7 @@ mod tests { "+-------------+", "| MY_AVG(t.i) |", "+-------------+", - "| 1 |", + "| 1.0 |", "+-------------+", ]; assert_batches_eq!(expected, &result); diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index 5eb859df9304..8761237bd014 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -43,7 +43,8 @@ pub mod context; pub mod disk_manager; pub mod memory_pool; -pub mod options; +// backwards compatibility +pub use crate::datasource::file_format::options; pub mod registry; pub mod runtime_env; diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index f3d36301792e..03e376878740 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -28,6 +28,7 @@ //! entities (e.g. entire files) if the statistics are known via some //! other source (e.g. 
a catalog) +use std::collections::HashSet; use std::convert::TryFrom; use std::sync::Arc; @@ -131,7 +132,7 @@ impl PruningPredicate { // build predicate expression once let mut required_columns = RequiredStatColumns::new(); let logical_predicate_expr = - build_predicate_expression(&expr, schema.as_ref(), &mut required_columns)?; + build_predicate_expression(&expr, schema.as_ref(), &mut required_columns); let stat_fields = required_columns .iter() .map(|(_, _, f)| f.clone()) @@ -258,6 +259,14 @@ impl RequiredStatColumns { Self::default() } + /// Returns number of unique columns. + pub(crate) fn n_columns(&self) -> usize { + self.iter() + .map(|(c, _s, _f)| c) + .collect::>() + .len() + } + /// Returns an iterator over items in columns (see doc on /// `self.columns` for details) pub(crate) fn iter(&self) -> impl Iterator { @@ -712,7 +721,7 @@ fn build_predicate_expression( expr: &Expr, schema: &Schema, required_columns: &mut RequiredStatColumns, -) -> Result { +) -> Expr { // Returned for unsupported expressions. Such expressions are // converted to TRUE. let unhandled = lit(true); @@ -721,23 +730,20 @@ fn build_predicate_expression( let (left, op, right) = match expr { Expr::BinaryExpr(BinaryExpr { left, op, right }) => (left, *op, right), Expr::IsNull(expr) => { - let expr = build_is_null_column_expr(expr, schema, required_columns) + return build_is_null_column_expr(expr, schema, required_columns) .unwrap_or(unhandled); - return Ok(expr); } Expr::Column(col) => { - let expr = build_single_column_expr(col, schema, required_columns, false) + return build_single_column_expr(col, schema, required_columns, false) .unwrap_or(unhandled); - return Ok(expr); } // match !col (don't do so recursively) Expr::Not(input) => { if let Expr::Column(col) = input.as_ref() { - let expr = build_single_column_expr(col, schema, required_columns, true) + return build_single_column_expr(col, schema, required_columns, true) .unwrap_or(unhandled); - return Ok(expr); } else { - return Ok(unhandled); + return unhandled; } } Expr::InList { @@ -755,13 +761,13 @@ fn build_predicate_expression( return build_predicate_expression(&change_expr, schema, required_columns); } _ => { - return Ok(unhandled); + return unhandled; } }; if op == Operator::And || op == Operator::Or { - let left_expr = build_predicate_expression(left, schema, required_columns)?; - let right_expr = build_predicate_expression(right, schema, required_columns)?; + let left_expr = build_predicate_expression(left, schema, required_columns); + let right_expr = build_predicate_expression(right, schema, required_columns); // simplify boolean expression if applicable let expr = match (&left_expr, op, &right_expr) { (left, Operator::And, _) if *left == unhandled => right_expr, @@ -771,7 +777,7 @@ fn build_predicate_expression( } _ => binary_expr(left_expr, op, right_expr), }; - return Ok(expr); + return expr; } let expr_builder = @@ -781,12 +787,11 @@ fn build_predicate_expression( // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND Err(_) => { - return Ok(unhandled); + return unhandled; } }; - let statistics_expr = build_statistics_expr(&mut expr_builder).unwrap_or(unhandled); - Ok(statistics_expr) + build_statistics_expr(&mut expr_builder).unwrap_or(unhandled) } fn build_statistics_expr(expr_builder: &mut PruningExpressionBuilder) -> Result { @@ -1213,7 +1218,7 @@ mod tests { Field::new("s1_min", DataType::Utf8, true), )]); - // Note the statistics return binary 
(which can't be cast to string) + // Note the statistics return an invalid UTF-8 sequence which will be converted to null let statistics = OneContainerStats { min_values: Some(Arc::new(BinaryArray::from_slice([&[255u8] as &[u8]]))), max_values: None, @@ -1268,13 +1273,13 @@ mod tests { // test column on the left let expr = col("c1").eq(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).eq(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1288,13 +1293,13 @@ mod tests { // test column on the left let expr = col("c1").not_eq(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).not_eq(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1308,13 +1313,13 @@ mod tests { // test column on the left let expr = col("c1").gt(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).lt(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1328,12 +1333,12 @@ mod tests { // test column on the left let expr = col("c1").gt_eq(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).lt_eq(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1347,13 +1352,13 @@ mod tests { // test column on the left let expr = col("c1").lt(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).gt(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1367,12 +1372,12 @@ mod tests { // test column on the left let expr = 
col("c1").lt_eq(lit(1)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(1).gt_eq(col("c1")); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1389,7 +1394,7 @@ mod tests { let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); let expected_expr = "c1_min < Int32(1)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1405,7 +1410,7 @@ mod tests { let expr = col("c1").lt(lit(1)).or(col("c2").modulus(lit(2))); let expected_expr = "Boolean(true)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1418,7 +1423,7 @@ mod tests { let expr = col("c1").not(); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1431,7 +1436,7 @@ mod tests { let expr = col("c1").not(); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1444,7 +1449,7 @@ mod tests { let expr = col("c1"); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1459,7 +1464,7 @@ mod tests { // this predicate will error when evaluated let expr = col("c1").lt(lit(true)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1478,7 +1483,7 @@ mod tests { .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); let expected_expr = "c1_min < Int32(1) AND (c2_min <= Int32(2) AND Int32(2) <= c2_max OR c2_min <= Int32(3) AND Int32(3) <= c2_max)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut required_columns)?; + build_predicate_expression(&expr, &schema, &mut required_columns); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // c1 < 1 should add c1_min let c1_min_field = Field::new("c1_min", DataType::Int32, false); @@ -1517,7 +1522,7 @@ mod tests { }; let expected_expr = "c1_min <= Int32(1) AND Int32(1) <= c1_max OR c1_min <= Int32(2) AND Int32(2) <= c1_max OR c1_min <= Int32(3) AND Int32(3) <= c1_max"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), 
expected_expr); Ok(()) @@ -1537,7 +1542,7 @@ mod tests { }; let expected_expr = "Boolean(true)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1559,7 +1564,7 @@ mod tests { AND (c1_min != Int32(2) OR Int32(2) != c1_max) \ AND (c1_min != Int32(3) OR Int32(3) != c1_max)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1574,13 +1579,13 @@ mod tests { // test column on the left let expr = cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(1)))); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(ScalarValue::Int64(Some(1))).eq(cast(col("c1"), DataType::Int64)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); let expected_expr = "TRY_CAST(c1_max AS Int64) > Int64(1)"; @@ -1589,14 +1594,14 @@ mod tests { let expr = try_cast(col("c1"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(1)))); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); // test column on the right let expr = lit(ScalarValue::Int64(Some(1))).lt(try_cast(col("c1"), DataType::Int64)); let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) @@ -1617,7 +1622,7 @@ mod tests { }; let expected_expr = "CAST(c1_min AS Int64) <= Int64(1) AND Int64(1) <= CAST(c1_max AS Int64) OR CAST(c1_min AS Int64) <= Int64(2) AND Int64(2) <= CAST(c1_max AS Int64) OR CAST(c1_min AS Int64) <= Int64(3) AND Int64(3) <= CAST(c1_max AS Int64)"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); let expr = Expr::InList { @@ -1634,7 +1639,7 @@ mod tests { AND (CAST(c1_min AS Int64) != Int64(2) OR Int64(2) != CAST(c1_max AS Int64)) \ AND (CAST(c1_min AS Int64) != Int64(3) OR Int64(3) != CAST(c1_max AS Int64))"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new()); assert_eq!(format!("{predicate_expr:?}"), expected_expr); Ok(()) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index 8ca50fec85f8..90a19a3b6e4b 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -811,22 +811,22 @@ mod tests { 
common::collect(partial_aggregate.execute(0, task_ctx.clone())?).await?; let expected = vec![ - "+---+---+-----------------+", - "| a | b | COUNT(1)[count] |", - "+---+---+-----------------+", - "| | 1 | 2 |", - "| | 2 | 2 |", - "| | 3 | 2 |", - "| | 4 | 2 |", - "| 2 | | 2 |", - "| 2 | 1 | 2 |", - "| 3 | | 3 |", - "| 3 | 2 | 2 |", - "| 3 | 3 | 1 |", - "| 4 | | 3 |", - "| 4 | 3 | 1 |", - "| 4 | 4 | 2 |", - "+---+---+-----------------+", + "+---+-----+-----------------+", + "| a | b | COUNT(1)[count] |", + "+---+-----+-----------------+", + "| | 1.0 | 2 |", + "| | 2.0 | 2 |", + "| | 3.0 | 2 |", + "| | 4.0 | 2 |", + "| 2 | | 2 |", + "| 2 | 1.0 | 2 |", + "| 3 | | 3 |", + "| 3 | 2.0 | 2 |", + "| 3 | 3.0 | 1 |", + "| 4 | | 3 |", + "| 4 | 3.0 | 1 |", + "| 4 | 4.0 | 2 |", + "+---+-----+-----------------+", ]; assert_batches_sorted_eq!(expected, &result); @@ -858,22 +858,22 @@ mod tests { assert_eq!(batch.num_rows(), 12); let expected = vec![ - "+---+---+----------+", - "| a | b | COUNT(1) |", - "+---+---+----------+", - "| | 1 | 2 |", - "| | 2 | 2 |", - "| | 3 | 2 |", - "| | 4 | 2 |", - "| 2 | | 2 |", - "| 2 | 1 | 2 |", - "| 3 | | 3 |", - "| 3 | 2 | 2 |", - "| 3 | 3 | 1 |", - "| 4 | | 3 |", - "| 4 | 3 | 1 |", - "| 4 | 4 | 2 |", - "+---+---+----------+", + "+---+-----+----------+", + "| a | b | COUNT(1) |", + "+---+-----+----------+", + "| | 1.0 | 2 |", + "| | 2.0 | 2 |", + "| | 3.0 | 2 |", + "| | 4.0 | 2 |", + "| 2 | | 2 |", + "| 2 | 1.0 | 2 |", + "| 3 | | 3 |", + "| 3 | 2.0 | 2 |", + "| 3 | 3.0 | 1 |", + "| 4 | | 3 |", + "| 4 | 3.0 | 1 |", + "| 4 | 4.0 | 2 |", + "+---+-----+----------+", ]; assert_batches_sorted_eq!(&expected, &result); @@ -919,9 +919,9 @@ mod tests { "+---+---------------+-------------+", "| a | AVG(b)[count] | AVG(b)[sum] |", "+---+---------------+-------------+", - "| 2 | 2 | 2 |", - "| 3 | 3 | 7 |", - "| 4 | 3 | 11 |", + "| 2 | 2 | 2.0 |", + "| 3 | 3 | 7.0 |", + "| 4 | 3 | 11.0 |", "+---+---------------+-------------+", ]; assert_batches_sorted_eq!(expected, &result); @@ -956,7 +956,7 @@ mod tests { "+---+--------------------+", "| a | AVG(b) |", "+---+--------------------+", - "| 2 | 1 |", + "| 2 | 1.0 |", "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 "+---+--------------------+", diff --git a/datafusion/core/src/physical_plan/coalesce_batches.rs b/datafusion/core/src/physical_plan/coalesce_batches.rs index 2e7211fc3ae4..ec7dd7b4d63a 100644 --- a/datafusion/core/src/physical_plan/coalesce_batches.rs +++ b/datafusion/core/src/physical_plan/coalesce_batches.rs @@ -295,7 +295,6 @@ mod tests { use crate::config::ConfigOptions; use crate::datasource::MemTable; use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::{memory::MemoryExec, repartition::RepartitionExec}; use crate::prelude::SessionContext; use crate::test::create_vec_batches; @@ -308,12 +307,7 @@ mod tests { let ctx = SessionContext::with_config(config.into()); let plan = create_physical_plan(ctx).await?; - let projection = plan.as_any().downcast_ref::().unwrap(); - let coalesce = projection - .input() - .as_any() - .downcast_ref::() - .unwrap(); + let coalesce = plan.as_any().downcast_ref::().unwrap(); assert_eq!(1234, coalesce.target_batch_size); Ok(()) } @@ -325,13 +319,7 @@ mod tests { let ctx = SessionContext::with_config(config.into()); let plan = create_physical_plan(ctx).await?; - let projection = plan.as_any().downcast_ref::().unwrap(); - // projection should directly wrap filter 
with no coalesce step - let _filter = projection - .input() - .as_any() - .downcast_ref::() - .unwrap(); + let _filter = plan.as_any().downcast_ref::().unwrap(); Ok(()) } diff --git a/datafusion/core/src/physical_plan/file_format/csv.rs b/datafusion/core/src/physical_plan/file_format/csv.rs index 337a54f42ef9..9197d8f3babf 100644 --- a/datafusion/core/src/physical_plan/file_format/csv.rs +++ b/datafusion/core/src/physical_plan/file_format/csv.rs @@ -345,7 +345,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn csv_exec_with_projection( @@ -400,7 +401,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn csv_exec_with_mixed_order_projection( @@ -455,7 +457,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn csv_exec_with_limit( @@ -510,7 +513,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn csv_exec_with_missing_column( @@ -553,7 +557,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn csv_exec_with_partition( @@ -688,7 +693,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn test_chunked_csv( diff --git a/datafusion/core/src/physical_plan/file_format/json.rs b/datafusion/core/src/physical_plan/file_format/json.rs index 6d26965ecd8e..3556774a8002 100644 --- a/datafusion/core/src/physical_plan/file_format/json.rs +++ b/datafusion/core/src/physical_plan/file_format/json.rs @@ -364,14 +364,14 @@ mod tests { assert_batches_eq!( &[ - "+-----+----------------+---------------+------+", - "| a | b | c | d |", - "+-----+----------------+---------------+------+", - "| 1 | [2, 1.3, -6.1] | [false, true] | 4 |", - "| -10 | [2, 1.3, -6.1] | [true, true] | 4 |", - "| 2 | [2, , -6.1] | [false, ] | text |", - "| | | | |", - "+-----+----------------+---------------+------+", + "+-----+------------------+---------------+------+", + "| a | b | c | d |", + "+-----+------------------+---------------+------+", + "| 1 | [2.0, 1.3, -6.1] | [false, true] | 4 |", + "| -10 | [2.0, 1.3, -6.1] | [true, true] | 4 |", + "| 2 | [2.0, , -6.1] | [false, ] | text |", + "| | | | |", + "+-----+------------------+---------------+------+", ], &results ); @@ -382,7 +382,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn nd_json_exec_file_without_projection( @@ 
-452,7 +453,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn nd_json_exec_file_with_missing_column( @@ -504,7 +506,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn nd_json_exec_file_projection( @@ -554,7 +557,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn nd_json_exec_file_mixed_order_projection( @@ -658,7 +662,8 @@ mod tests { case(FileCompressionType::UNCOMPRESSED), case(FileCompressionType::GZIP), case(FileCompressionType::BZIP2), - case(FileCompressionType::XZ) + case(FileCompressionType::XZ), + case(FileCompressionType::ZSTD) )] #[tokio::test] async fn test_chunked_json( diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index 5bb03b4f42bb..e2d8cc94dcce 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -376,6 +376,7 @@ impl ExecutionPlan for ParquetExec { partition_index, projection: Arc::from(projection), batch_size: ctx.session_config().batch_size(), + limit: self.base_config.limit, predicate: self.predicate.clone(), pruning_predicate: self.pruning_predicate.clone(), page_pruning_predicate: self.page_pruning_predicate.clone(), @@ -460,6 +461,7 @@ struct ParquetOpener { partition_index: usize, projection: Arc<[usize]>, batch_size: usize, + limit: Option, predicate: Option>, pruning_predicate: Option>, page_pruning_predicate: Option>, @@ -500,6 +502,7 @@ impl FileOpener for ParquetOpener { let reorder_predicates = self.reorder_filters; let pushdown_filters = self.pushdown_filters; let enable_page_index = self.enable_page_index; + let limit = self.limit; Ok(Box::pin(async move { let options = ArrowReaderOptions::new().with_page_index(enable_page_index); @@ -562,6 +565,10 @@ impl FileOpener for ParquetOpener { } } + if let Some(limit) = limit { + builder = builder.with_limit(limit) + } + let stream = builder .with_projection(mask) .with_batch_size(batch_size) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs index ebe59db9e713..585f0c886245 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs @@ -110,14 +110,16 @@ impl PagePruningPredicate { pub fn try_new(expr: &Expr, schema: SchemaRef) -> Result { let predicates = split_conjunction(expr) .into_iter() - .filter_map(|predicate| match predicate.to_columns() { - Ok(columns) if columns.len() == 1 => { - match PruningPredicate::try_new(predicate.clone(), schema.clone()) { - Ok(p) if !p.allways_true() => Some(Ok(p)), - _ => None, + .filter_map(|predicate| { + match PruningPredicate::try_new(predicate.clone(), schema.clone()) { + Ok(p) + if (!p.allways_true()) + && (p.required_columns().n_columns() < 2) => + { + Some(Ok(p)) } + _ => None, } - _ => None, }) .collect::>>()?; Ok(Self { 
predicates }) diff --git a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs index 88667b692b54..8fa5145938c4 100644 --- a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs +++ b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs @@ -1983,13 +1983,13 @@ mod tests { let (_, batches) = join_collect(left, right, on, JoinType::Inner).await?; let expected = vec![ - "+------------+------------+------------+------------+------------+------------+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+------------+------------+------------+------------+------------+------------+", - "| 1970-01-01 | 2022-04-23 | 1970-01-01 | 1970-01-01 | 2022-04-23 | 1970-01-01 |", - "| 1970-01-01 | 2022-04-25 | 1970-01-01 | 1970-01-01 | 2022-04-25 | 1970-01-01 |", - "| 1970-01-01 | 2022-04-25 | 1970-01-01 | 1970-01-01 | 2022-04-25 | 1970-01-01 |", - "+------------+------------+------------+------------+------------+------------+", + "+-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+", + "| 1970-01-01T00:00:00.001 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.007 | 1970-01-01T00:00:00.010 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.070 |", + "| 1970-01-01T00:00:00.002 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.008 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |", + "| 1970-01-01T00:00:00.003 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.009 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |", + "+-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+", ]; // The output order is important as SMJ preserves sortedness assert_batches_eq!(expected, &batches); diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs index c8f5eb793910..dbd1024ae482 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ b/datafusion/core/src/physical_plan/mod.rs @@ -318,15 +318,14 @@ pub fn with_new_children_if_necessary( /// let normalized = Path::from_filesystem_path(working_directory).unwrap(); /// let plan_string = plan_string.replace(normalized.as_ref(), "WORKING_DIR"); /// -/// assert_eq!("ProjectionExec: expr=[a@0 as a]\ -/// \n CoalesceBatchesExec: target_batch_size=8192\ -/// \n FilterExec: a@0 < 5\ -/// \n RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1\ -/// \n CsvExec: files={1 group: [[WORKING_DIR/tests/data/example.csv]]}, has_header=true, limit=None, projection=[a]", +/// assert_eq!("CoalesceBatchesExec: target_batch_size=8192\ +/// \n FilterExec: a@0 < 5\ +/// \n RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1\ +/// \n CsvExec: files={1 group: [[WORKING_DIR/tests/data/example.csv]]}, has_header=true, limit=None, projection=[a]", /// plan_string.trim()); /// /// let one_line = format!("{}", displayable_plan.one_line()); -/// assert_eq!("ProjectionExec: expr=[a@0 as a]", one_line.trim()); +/// assert_eq!("CoalesceBatchesExec: target_batch_size=8192", one_line.trim()); /// } /// ``` /// diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 93616f3fc684..86b1677f4796 100644 --- 
a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -27,8 +27,8 @@ use crate::datasource::source_as_provider; use crate::execution::context::{ExecutionProps, SessionState}; use crate::logical_expr::utils::generate_sort_key; use crate::logical_expr::{ - Aggregate, Distinct, EmptyRelation, Join, Projection, Sort, SubqueryAlias, TableScan, - Unnest, Window, + Aggregate, EmptyRelation, Join, Projection, Sort, SubqueryAlias, TableScan, Unnest, + Window, }; use crate::logical_expr::{ CrossJoin, Expr, LogicalPlan, Partitioning as LogicalPartitioning, PlanType, @@ -65,7 +65,6 @@ use datafusion_expr::expr::{ }; use datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; -use datafusion_expr::utils::expand_wildcard; use datafusion_expr::{logical_plan, StringifiedPlan}; use datafusion_expr::{WindowFrame, WindowFrameBound}; use datafusion_optimizer::utils::unalias; @@ -710,17 +709,6 @@ impl DefaultPhysicalPlanner { physical_input_schema.clone(), )?)) } - LogicalPlan::Distinct(Distinct { input }) => { - // Convert distinct to groupby with no aggregations - let group_expr = expand_wildcard(input.schema(), input)?; - let aggregate = LogicalPlan::Aggregate(Aggregate::try_new_with_schema( - input.clone(), - group_expr, - vec![], - input.schema().clone(), // input schema and aggregate schema are the same in this case - )?); - Ok(self.create_initial_plan(&aggregate, session_state).await?) - } LogicalPlan::Projection(Projection { input, expr, .. }) => { let input_exec = self.create_initial_plan(input, session_state).await?; let input_schema = input.as_ref().schema(); @@ -1207,6 +1195,11 @@ impl DefaultPhysicalPlanner { LogicalPlan::Explain(_) => Err(DataFusionError::Internal( "Unsupported logical plan: Explain must be root of the plan".to_string(), )), + LogicalPlan::Distinct(_) => { + Err(DataFusionError::Internal( + "Unsupported logical plan: Distinct should be replaced to Aggregate".to_string(), + )) + } LogicalPlan::Analyze(a) => { let input = self.create_initial_plan(&a.input, session_state).await?; let schema = SchemaRef::new((*a.schema).clone().into()); diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 9bacf9fad6ff..c3fc06206ca1 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -1054,7 +1054,7 @@ mod tests { #[tokio::test] async fn test_sort_fetch_memory_calculation() -> Result<()> { // This test mirrors down the size from the example above. 
- let avg_batch_size = 5336; + let avg_batch_size = 6000; let partitions = 4; // A tuple of (fetch, expect_spillage) diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 737893f51845..9dccde8a7345 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -36,6 +36,7 @@ use arrow::record_batch::RecordBatch; use bzip2::write::BzEncoder; #[cfg(feature = "compression")] use bzip2::Compression as BzCompression; +use datafusion_common::DataFusionError; #[cfg(feature = "compression")] use flate2::write::GzEncoder; #[cfg(feature = "compression")] @@ -49,6 +50,8 @@ use std::sync::Arc; use tempfile::TempDir; #[cfg(feature = "compression")] use xz2::write::XzEncoder; +#[cfg(feature = "compression")] +use zstd::Encoder as ZstdEncoder; pub fn create_table_dual() -> Arc { let dual_schema = Arc::new(Schema::new(vec![ @@ -124,14 +127,22 @@ pub fn partitioned_file_groups( #[cfg(feature = "compression")] FileCompressionType::XZ => Box::new(XzEncoder::new(file, 9)), #[cfg(feature = "compression")] + FileCompressionType::ZSTD => { + let encoder = ZstdEncoder::new(file, 0) + .map_err(|e| DataFusionError::External(Box::new(e)))? + .auto_finish(); + Box::new(encoder) + } + #[cfg(feature = "compression")] FileCompressionType::BZIP2 => { Box::new(BzEncoder::new(file, BzCompression::default())) } #[cfg(not(feature = "compression"))] FileCompressionType::GZIP | FileCompressionType::BZIP2 - | FileCompressionType::XZ => { - panic!("GZIP compression is not supported in this build") + | FileCompressionType::XZ + | FileCompressionType::ZSTD => { + panic!("Compression is not supported in this build") } }; diff --git a/datafusion/core/tests/custom_sources.rs b/datafusion/core/tests/custom_sources.rs index 9842f1b596e7..e78351500ca9 100644 --- a/datafusion/core/tests/custom_sources.rs +++ b/datafusion/core/tests/custom_sources.rs @@ -22,7 +22,7 @@ use arrow::record_batch::RecordBatch; use datafusion::execution::context::{SessionContext, SessionState, TaskContext}; use datafusion::from_slice::FromSlice; use datafusion::logical_expr::{ - col, Expr, LogicalPlan, LogicalPlanBuilder, Projection, TableScan, UNNAMED_TABLE, + col, Expr, LogicalPlan, LogicalPlanBuilder, TableScan, UNNAMED_TABLE, }; use datafusion::physical_plan::empty::EmptyExec; use datafusion::physical_plan::expressions::PhysicalSortExpr; @@ -214,24 +214,18 @@ async fn custom_source_dataframe() -> Result<()> { let optimized_plan = state.optimize(&logical_plan)?; match &optimized_plan { - LogicalPlan::Projection(Projection { input, .. }) => match &**input { - LogicalPlan::TableScan(TableScan { - source, - projected_schema, - .. - }) => { - assert_eq!(source.schema().fields().len(), 2); - assert_eq!(projected_schema.fields().len(), 1); - } - _ => panic!("input to projection should be TableScan"), - }, - _ => panic!("expect optimized_plan to be projection"), + LogicalPlan::TableScan(TableScan { + source, + projected_schema, + .. 
+ }) => { + assert_eq!(source.schema().fields().len(), 2); + assert_eq!(projected_schema.fields().len(), 1); + } + _ => panic!("input to projection should be TableScan"), } - let expected = format!( - "Projection: {UNNAMED_TABLE}.c2\ - \n TableScan: {UNNAMED_TABLE} projection=[c2]" - ); + let expected = format!("TableScan: {UNNAMED_TABLE} projection=[c2]"); assert_eq!(format!("{optimized_plan:?}"), expected); let physical_plan = state.create_physical_plan(&optimized_plan).await?; @@ -242,7 +236,7 @@ async fn custom_source_dataframe() -> Result<()> { let batches = collect(physical_plan, state.task_ctx()).await?; let origin_rec_batch = TEST_CUSTOM_RECORD_BATCH!()?; assert_eq!(1, batches.len()); - assert_eq!(1, batches[0].num_columns()); + assert_eq!(2, batches[0].num_columns()); assert_eq!(origin_rec_batch.num_rows(), batches[0].num_rows()); Ok(()) @@ -270,8 +264,8 @@ async fn optimizers_catch_all_statistics() { let expected = RecordBatch::try_new( Arc::new(Schema::new(vec![ Field::new("COUNT(UInt8(1))", DataType::Int64, false), - Field::new("MIN(test.c1)", DataType::Int32, false), - Field::new("MAX(test.c1)", DataType::Int32, false), + Field::new("MIN(c1)", DataType::Int32, false), + Field::new("MAX(c1)", DataType::Int32, false), ])), vec![ Arc::new(Int64Array::from_slice([4])), diff --git a/datafusion/core/tests/dataframe.rs b/datafusion/core/tests/dataframe.rs index f10902839606..a9e28848cc0e 100644 --- a/datafusion/core/tests/dataframe.rs +++ b/datafusion/core/tests/dataframe.rs @@ -128,7 +128,7 @@ async fn sort_on_unprojected_columns() -> Result<()> { } #[tokio::test] -async fn sort_on_distinct_unprojected_columns() -> Result<()> { +async fn sort_on_distinct_columns() -> Result<()> { let schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), @@ -138,7 +138,7 @@ async fn sort_on_distinct_unprojected_columns() -> Result<()> { Arc::new(schema.clone()), vec![ Arc::new(Int32Array::from_slice([1, 10, 10, 100])), - Arc::new(Int32Array::from_slice([2, 12, 12, 120])), + Arc::new(Int32Array::from_slice([2, 3, 4, 5])), ], ) .unwrap(); @@ -153,7 +153,7 @@ async fn sort_on_distinct_unprojected_columns() -> Result<()> { .unwrap() .distinct() .unwrap() - .sort(vec![Expr::Sort(Sort::new(Box::new(col("b")), false, true))]) + .sort(vec![Expr::Sort(Sort::new(Box::new(col("a")), false, true))]) .unwrap(); let results = df.collect().await.unwrap(); @@ -170,6 +170,38 @@ async fn sort_on_distinct_unprojected_columns() -> Result<()> { assert_batches_eq!(expected, &results); Ok(()) } +#[tokio::test] +async fn sort_on_distinct_unprojected_columns() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![ + Arc::new(Int32Array::from_slice([1, 10, 10, 100])), + Arc::new(Int32Array::from_slice([2, 3, 4, 5])), + ], + ) + .unwrap(); + + // Cannot sort on a column after distinct that would add a new column + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); + let err = ctx + .table("t") + .await + .unwrap() + .select(vec![col("a")]) + .unwrap() + .distinct() + .unwrap() + .sort(vec![Expr::Sort(Sort::new(Box::new(col("b")), false, true))]) + .unwrap_err(); + assert_eq!(err.to_string(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions b must appear in select list"); + Ok(()) +} #[tokio::test] async fn filter_with_alias_overwrite() -> Result<()> { @@ -218,7 +250,7 @@ 
async fn select_with_alias_overwrite() -> Result<()> { )?; let ctx = SessionContext::new(); - ctx.register_batch("t", batch).unwrap(); + ctx.register_batch("t", batch)?; let df = ctx .table("t") @@ -361,19 +393,19 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { "| | 2 | 184 | 8.363636363636363 |", "| | 1 | 367 | 16.681818181818183 |", "| e | | 847 | 40.333333333333336 |", - "| e | 5 | -22 | -11 |", + "| e | 5 | -22 | -11.0 |", "| e | 4 | 261 | 37.285714285714285 |", - "| e | 3 | 192 | 48 |", + "| e | 3 | 192 | 48.0 |", "| e | 2 | 189 | 37.8 |", "| e | 1 | 227 | 75.66666666666667 |", "| d | | 458 | 25.444444444444443 |", "| d | 5 | -99 | -49.5 |", - "| d | 4 | 162 | 54 |", + "| d | 4 | 162 | 54.0 |", "| d | 3 | 124 | 41.333333333333336 |", "| d | 2 | 328 | 109.33333333333333 |", "| d | 1 | -57 | -8.142857142857142 |", "| c | | -28 | -1.3333333333333333 |", - "| c | 5 | 24 | 12 |", + "| c | 5 | 24 | 12.0 |", "| c | 4 | -43 | -10.75 |", "| c | 3 | 190 | 47.5 |", "| c | 2 | -389 | -55.57142857142857 |", @@ -381,12 +413,12 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { "| b | | -111 | -5.842105263157895 |", "| b | 5 | -1 | -0.2 |", "| b | 4 | -223 | -44.6 |", - "| b | 3 | -84 | -42 |", + "| b | 3 | -84 | -42.0 |", "| b | 2 | 102 | 25.5 |", "| b | 1 | 95 | 31.666666666666668 |", "| a | | -385 | -18.333333333333332 |", - "| a | 5 | -96 | -32 |", - "| a | 4 | -128 | -32 |", + "| a | 5 | -96 | -32.0 |", + "| a | 4 | -128 | -32.0 |", "| a | 3 | -27 | -4.5 |", "| a | 2 | -46 | -15.333333333333334 |", "| a | 1 | -88 | -17.6 |", @@ -470,12 +502,11 @@ async fn right_semi_with_alias_filter() -> Result<()> { .select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?; let optimized_plan = df.clone().into_optimized_plan()?; let expected = vec![ - "Projection: t2.a, t2.b, t2.c [a:UInt32, b:Utf8, c:Int32]", - " RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]", - " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", - " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", - " Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", - " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", + "RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]", + " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", + " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", + " Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", + " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", ]; let formatted = optimized_plan.display_indent_schema().to_string(); @@ -515,11 +546,10 @@ async fn right_anti_filter_push_down() -> Result<()> { .select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?; let optimized_plan = df.clone().into_optimized_plan()?; let expected = vec![ - "Projection: t2.a, t2.b, t2.c [a:UInt32, b:Utf8, c:Int32]", - " RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", - " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", - " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", - " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", + "RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", + " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", + " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", + " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", ]; let formatted = optimized_plan.display_indent_schema().to_string(); @@ -548,14 +578,14 @@ async fn unnest_columns() -> Result<()> { let df = table_with_nested_types(NUM_ROWS).await?; let results = df.collect().await?; let 
expected = vec![ - r#"+----------+------------------------------------------------------------+--------------------+"#, - r#"| shape_id | points | tags |"#, - r#"+----------+------------------------------------------------------------+--------------------+"#, - r#"| 1 | [{"x": -3, "y": -4}, {"x": -3, "y": 6}, {"x": 2, "y": -2}] | [tag1] |"#, - r#"| 2 | | [tag1, tag2] |"#, - r#"| 3 | [{"x": -9, "y": 2}, {"x": -10, "y": -4}] | |"#, - r#"| 4 | [{"x": -3, "y": 5}, {"x": 2, "y": -1}] | [tag1, tag2, tag3] |"#, - r#"+----------+------------------------------------------------------------+--------------------+"#, + "+----------+------------------------------------------------+--------------------+", + "| shape_id | points | tags |", + "+----------+------------------------------------------------+--------------------+", + "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | [tag1] |", + "| 2 | | [tag1, tag2] |", + "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", + "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | [tag1, tag2, tag3] |", + "+----------+------------------------------------------------+--------------------+", ]; assert_batches_sorted_eq!(expected, &results); @@ -563,17 +593,17 @@ async fn unnest_columns() -> Result<()> { let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_column("tags")?.collect().await?; let expected = vec![ - r#"+----------+------------------------------------------------------------+------+"#, - r#"| shape_id | points | tags |"#, - r#"+----------+------------------------------------------------------------+------+"#, - r#"| 1 | [{"x": -3, "y": -4}, {"x": -3, "y": 6}, {"x": 2, "y": -2}] | tag1 |"#, - r#"| 2 | | tag1 |"#, - r#"| 2 | | tag2 |"#, - r#"| 3 | [{"x": -9, "y": 2}, {"x": -10, "y": -4}] | |"#, - r#"| 4 | [{"x": -3, "y": 5}, {"x": 2, "y": -1}] | tag1 |"#, - r#"| 4 | [{"x": -3, "y": 5}, {"x": 2, "y": -1}] | tag2 |"#, - r#"| 4 | [{"x": -3, "y": 5}, {"x": 2, "y": -1}] | tag3 |"#, - r#"+----------+------------------------------------------------------------+------+"#, + "+----------+------------------------------------------------+------+", + "| shape_id | points | tags |", + "+----------+------------------------------------------------+------+", + "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | tag1 |", + "| 2 | | tag1 |", + "| 2 | | tag2 |", + "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", + "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag1 |", + "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag2 |", + "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag3 |", + "+----------+------------------------------------------------+------+", ]; assert_batches_sorted_eq!(expected, &results); @@ -586,18 +616,18 @@ async fn unnest_columns() -> Result<()> { let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_column("points")?.collect().await?; let expected = vec![ - r#"+----------+---------------------+--------------------+"#, - r#"| shape_id | points | tags |"#, - r#"+----------+---------------------+--------------------+"#, - r#"| 1 | {"x": -3, "y": -4} | [tag1] |"#, - r#"| 1 | {"x": -3, "y": 6} | [tag1] |"#, - r#"| 1 | {"x": 2, "y": -2} | [tag1] |"#, - r#"| 2 | | [tag1, tag2] |"#, - r#"| 3 | {"x": -9, "y": 2} | |"#, - r#"| 3 | {"x": -10, "y": -4} | |"#, - r#"| 4 | {"x": -3, "y": 5} | [tag1, tag2, tag3] |"#, - r#"| 4 | {"x": 2, "y": -1} | [tag1, tag2, tag3] |"#, - r#"+----------+---------------------+--------------------+"#, + "+----------+-----------------+--------------------+", + "| shape_id | points | tags |", + 
"+----------+-----------------+--------------------+", + "| 1 | {x: -3, y: -4} | [tag1] |", + "| 1 | {x: -3, y: 6} | [tag1] |", + "| 1 | {x: 2, y: -2} | [tag1] |", + "| 2 | | [tag1, tag2] |", + "| 3 | {x: -10, y: -4} | |", + "| 3 | {x: -9, y: 2} | |", + "| 4 | {x: -3, y: 5} | [tag1, tag2, tag3] |", + "| 4 | {x: 2, y: -1} | [tag1, tag2, tag3] |", + "+----------+-----------------+--------------------+", ]; assert_batches_sorted_eq!(expected, &results); @@ -614,23 +644,23 @@ async fn unnest_columns() -> Result<()> { .collect() .await?; let expected = vec![ - r#"+----------+---------------------+------+"#, - r#"| shape_id | points | tags |"#, - r#"+----------+---------------------+------+"#, - r#"| 1 | {"x": -3, "y": -4} | tag1 |"#, - r#"| 1 | {"x": -3, "y": 6} | tag1 |"#, - r#"| 1 | {"x": 2, "y": -2} | tag1 |"#, - r#"| 2 | | tag1 |"#, - r#"| 2 | | tag2 |"#, - r#"| 3 | {"x": -9, "y": 2} | |"#, - r#"| 3 | {"x": -10, "y": -4} | |"#, - r#"| 4 | {"x": -3, "y": 5} | tag1 |"#, - r#"| 4 | {"x": -3, "y": 5} | tag2 |"#, - r#"| 4 | {"x": -3, "y": 5} | tag3 |"#, - r#"| 4 | {"x": 2, "y": -1} | tag1 |"#, - r#"| 4 | {"x": 2, "y": -1} | tag2 |"#, - r#"| 4 | {"x": 2, "y": -1} | tag3 |"#, - r#"+----------+---------------------+------+"#, + "+----------+-----------------+------+", + "| shape_id | points | tags |", + "+----------+-----------------+------+", + "| 1 | {x: -3, y: -4} | tag1 |", + "| 1 | {x: -3, y: 6} | tag1 |", + "| 1 | {x: 2, y: -2} | tag1 |", + "| 2 | | tag1 |", + "| 2 | | tag2 |", + "| 3 | {x: -10, y: -4} | |", + "| 3 | {x: -9, y: 2} | |", + "| 4 | {x: -3, y: 5} | tag1 |", + "| 4 | {x: -3, y: 5} | tag2 |", + "| 4 | {x: -3, y: 5} | tag3 |", + "| 4 | {x: 2, y: -1} | tag1 |", + "| 4 | {x: 2, y: -1} | tag2 |", + "| 4 | {x: 2, y: -1} | tag3 |", + "+----------+-----------------+------+", ]; assert_batches_sorted_eq!(expected, &results); diff --git a/datafusion/core/tests/dataframe_functions.rs b/datafusion/core/tests/dataframe_functions.rs index c6291dc36c46..8d280692e8c7 100644 --- a/datafusion/core/tests/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe_functions.rs @@ -662,10 +662,10 @@ async fn test_cast() -> Result<()> { "+--------+", "| test.b |", "+--------+", - "| 1 |", - "| 10 |", - "| 10 |", - "| 100 |", + "| 1.0 |", + "| 10.0 |", + "| 10.0 |", + "| 100.0 |", "+--------+", ]; diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs index cea7edf749fb..972c8a94a2f2 100644 --- a/datafusion/core/tests/sql/aggregates.rs +++ b/datafusion/core/tests/sql/aggregates.rs @@ -99,11 +99,11 @@ async fn aggregate_timestamps_count() -> Result<()> { .await; let expected = vec![ - "+----------------+-----------------+-----------------+---------------+", - "| COUNT(t.nanos) | COUNT(t.micros) | COUNT(t.millis) | COUNT(t.secs) |", - "+----------------+-----------------+-----------------+---------------+", - "| 3 | 3 | 3 | 3 |", - "+----------------+-----------------+-----------------+---------------+", + "+--------------+---------------+---------------+-------------+", + "| COUNT(nanos) | COUNT(micros) | COUNT(millis) | COUNT(secs) |", + "+--------------+---------------+---------------+-------------+", + "| 3 | 3 | 3 | 3 |", + "+--------------+---------------+---------------+-------------+", ]; assert_batches_sorted_eq!(expected, &results); @@ -185,11 +185,11 @@ async fn aggregate_times_count() -> Result<()> { .await; let expected = vec![ - "+----------------+-----------------+-----------------+---------------+", - "| COUNT(t.nanos) | COUNT(t.micros) | 
COUNT(t.millis) | COUNT(t.secs) |", - "+----------------+-----------------+-----------------+---------------+", - "| 4 | 4 | 4 | 4 |", - "+----------------+-----------------+-----------------+---------------+", + "+--------------+---------------+---------------+-------------+", + "| COUNT(nanos) | COUNT(micros) | COUNT(millis) | COUNT(secs) |", + "+--------------+---------------+---------------+-------------+", + "| 4 | 4 | 4 | 4 |", + "+--------------+---------------+---------------+-------------+", ]; assert_batches_sorted_eq!(expected, &results); @@ -962,11 +962,11 @@ async fn test_accumulator_row_accumulator() -> Result<()> { "+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+", "| c1 | c2 | min1 | min2 | max1 | max2 | avg1 | min3 | cnt1 | sum1 |", "+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+", - "| a | 1 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 774637006 | waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs | 4015442341 | 2437927011 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 5 | 6094771121.5 |", - "| a | 2 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 145294611 | ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 | 3717551163 | 2267588664 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 3 | 3401364777 |", + "| a | 1 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 774637006 | waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs | 4015442341 | 2437927011.0 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 5 | 6094771121.5 |", + "| a | 2 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 145294611 | ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 | 3717551163 | 2267588664.0 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 3 | 3401364777.0 |", "| a | 3 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 431948861 | oLZ21P2JEDooxV1pU31cIxQHEeeoLu | 3998790955 | 2225685115.1666665 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 6 | 6676994872.5 |", - "| a | 4 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 466439833 | ydkwycaISlYSlEq3TlkS2m15I2pcp8 | 2502326480 | 1655431654 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 4 | 3310812222.5 |", - "| a | 5 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 141047417 | QJYm7YRA3YetcBHI5wkMZeLXVmfuNy | 2496054700 | 1216992989.6666667 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 3 | 1825431770 |", + "| a | 4 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 466439833 | ydkwycaISlYSlEq3TlkS2m15I2pcp8 | 2502326480 | 1655431654.0 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 4 | 3310812222.5 |", + "| a | 5 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 141047417 | QJYm7YRA3YetcBHI5wkMZeLXVmfuNy | 2496054700 | 1216992989.6666667 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 3 | 1825431770.0 |", "+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/core/tests/sql/avro.rs b/datafusion/core/tests/sql/avro.rs index d7bbf261cddc..d933db067d6d 100644 --- a/datafusion/core/tests/sql/avro.rs +++ b/datafusion/core/tests/sql/avro.rs @@ -140,18 +140,16 @@ async fn avro_explain() { let expected = vec![ vec![ "logical_plan", - "Projection: COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n TableScan: alltypes_plain projection=[id]", + "Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: alltypes_plain projection=[id]", ], vec![ "physical_plan", - "ProjectionExec: expr=[COUNT(UInt8(1))@0 as COUNT(UInt8(1))]\ - \n AggregateExec: mode=Final, gby=[], aggr=[COUNT(UInt8(1))]\ - \n 
CoalescePartitionsExec\ - \n AggregateExec: mode=Partial, gby=[], aggr=[COUNT(UInt8(1))]\ - \n RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1\ - \n AvroExec: files={1 group: [[ARROW_TEST_DATA/avro/alltypes_plain.avro]]}, limit=None\ + "AggregateExec: mode=Final, gby=[], aggr=[COUNT(UInt8(1))]\ + \n CoalescePartitionsExec\ + \n AggregateExec: mode=Partial, gby=[], aggr=[COUNT(UInt8(1))]\ + \n RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1\ + \n AvroExec: files={1 group: [[ARROW_TEST_DATA/avro/alltypes_plain.avro]]}, limit=None\ \n", ], ]; diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 99f5bd2d670d..3cbf50275326 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -650,8 +650,7 @@ async fn test_physical_plan_display_indent_multi_children() { " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 9000), input_partitions=9000", " RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1", - " ProjectionExec: expr=[c1@0 as c1]", - " CsvExec: files={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1]", + " CsvExec: files={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1]", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c2\", index: 0 }], 9000), input_partitions=9000", " RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1", @@ -757,10 +756,9 @@ async fn explain_logical_plan_only() { let expected = vec![ vec![ "logical_plan", - "Projection: COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n SubqueryAlias: t\ - \n Values: (Utf8(\"a\"), Int64(1), Int64(100)), (Utf8(\"a\"), Int64(2), Int64(150))", + "Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n SubqueryAlias: t\ + \n Values: (Utf8(\"a\"), Int64(1), Int64(100)), (Utf8(\"a\"), Int64(2), Int64(150))", ]]; assert_eq!(expected, actual); } @@ -776,9 +774,8 @@ async fn explain_physical_plan_only() { let expected = vec![vec![ "physical_plan", - "ProjectionExec: expr=[COUNT(UInt8(1))@0 as COUNT(UInt8(1))]\ - \n ProjectionExec: expr=[2 as COUNT(UInt8(1))]\ - \n EmptyExec: produce_one_row=true\ + "ProjectionExec: expr=[2 as COUNT(UInt8(1))]\ + \n EmptyExec: produce_one_row=true\ \n", ]]; assert_eq!(expected, actual); diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 01fd26723c46..24017f9cd274 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -514,7 +514,7 @@ async fn query_without_from() -> Result<()> { "+---------------------+---------------------+---------------+", "| Int64(1) + Int64(2) | Int64(3) / Int64(4) | cos(Int64(0)) |", "+---------------------+---------------------+---------------+", - "| 3 | 0 | 1 |", + "| 3 | 0 | 1.0 |", "+---------------------+---------------------+---------------+", ]; assert_batches_eq!(expected, &actual); @@ -818,18 +818,12 @@ async fn test_array_literals() -> Result<()> { #[tokio::test] async fn test_struct_literals() -> Result<()> { - test_expression!( - "STRUCT(1,2,3,4,5)", - "{\"c0\": 1, \"c1\": 2, \"c2\": 3, \"c3\": 4, \"c4\": 5}" - ); - test_expression!("STRUCT(Null)", "{\"c0\": null}"); - test_expression!("STRUCT(2)", "{\"c0\": 2}"); - 
test_expression!("STRUCT('1',Null)", "{\"c0\": \"1\", \"c1\": null}"); - test_expression!("STRUCT(true, false)", "{\"c0\": true, \"c1\": false}"); - test_expression!( - "STRUCT('str1', 'str2')", - "{\"c0\": \"str1\", \"c1\": \"str2\"}" - ); + test_expression!("STRUCT(1,2,3,4,5)", "{c0: 1, c1: 2, c2: 3, c3: 4, c4: 5}"); + test_expression!("STRUCT(Null)", "{c0: }"); + test_expression!("STRUCT(2)", "{c0: 2}"); + test_expression!("STRUCT('1',Null)", "{c0: 1, c1: }"); + test_expression!("STRUCT(true, false)", "{c0: true, c1: false}"); + test_expression!("STRUCT('str1', 'str2')", "{c0: str1, c1: str2}"); Ok(()) } @@ -1236,53 +1230,53 @@ async fn in_list_array() -> Result<()> { #[tokio::test] async fn test_extract_date_part() -> Result<()> { - test_expression!("date_part('YEAR', CAST('2000-01-01' AS DATE))", "2000"); + test_expression!("date_part('YEAR', CAST('2000-01-01' AS DATE))", "2000.0"); test_expression!( "EXTRACT(year FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "2020" + "2020.0" ); - test_expression!("date_part('QUARTER', CAST('2000-01-01' AS DATE))", "1"); + test_expression!("date_part('QUARTER', CAST('2000-01-01' AS DATE))", "1.0"); test_expression!( "EXTRACT(quarter FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "3" + "3.0" ); - test_expression!("date_part('MONTH', CAST('2000-01-01' AS DATE))", "1"); + test_expression!("date_part('MONTH', CAST('2000-01-01' AS DATE))", "1.0"); test_expression!( "EXTRACT(month FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "9" + "9.0" ); - test_expression!("date_part('WEEK', CAST('2003-01-01' AS DATE))", "1"); + test_expression!("date_part('WEEK', CAST('2003-01-01' AS DATE))", "1.0"); test_expression!( "EXTRACT(WEEK FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "37" + "37.0" ); - test_expression!("date_part('DAY', CAST('2000-01-01' AS DATE))", "1"); + test_expression!("date_part('DAY', CAST('2000-01-01' AS DATE))", "1.0"); test_expression!( "EXTRACT(day FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "8" + "8.0" ); - test_expression!("date_part('DOY', CAST('2000-01-01' AS DATE))", "1"); + test_expression!("date_part('DOY', CAST('2000-01-01' AS DATE))", "1.0"); test_expression!( "EXTRACT(doy FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "252" + "252.0" ); - test_expression!("date_part('DOW', CAST('2000-01-01' AS DATE))", "6"); + test_expression!("date_part('DOW', CAST('2000-01-01' AS DATE))", "6.0"); test_expression!( "EXTRACT(dow FROM to_timestamp('2020-09-08T12:00:00+00:00'))", - "2" + "2.0" ); - test_expression!("date_part('HOUR', CAST('2000-01-01' AS DATE))", "0"); + test_expression!("date_part('HOUR', CAST('2000-01-01' AS DATE))", "0.0"); test_expression!( "EXTRACT(hour FROM to_timestamp('2020-09-08T12:03:03+00:00'))", - "12" + "12.0" ); test_expression!( "EXTRACT(minute FROM to_timestamp('2020-09-08T12:12:00+00:00'))", - "12" + "12.0" ); test_expression!( "date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00'))", - "12" + "12.0" ); test_expression!( "EXTRACT(second FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", @@ -1298,7 +1292,7 @@ async fn test_extract_date_part() -> Result<()> { ); test_expression!( "EXTRACT(nanosecond FROM to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", - "12123456780" + "1.212345678e10" ); test_expression!( "date_part('second', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", @@ -1314,7 +1308,7 @@ async fn test_extract_date_part() -> Result<()> { ); test_expression!( "date_part('nanosecond', to_timestamp('2020-09-08T12:00:12.12345678+00:00'))", - "12123456780" + 
"1.212345678e10" ); Ok(()) } diff --git a/datafusion/core/tests/sql/functions.rs b/datafusion/core/tests/sql/functions.rs index 615871076e3b..ab3cc82b5069 100644 --- a/datafusion/core/tests/sql/functions.rs +++ b/datafusion/core/tests/sql/functions.rs @@ -67,8 +67,8 @@ async fn csv_query_cast_literal() -> Result<()> { "+--------------------+----------+", "| c12 | Int64(1) |", "+--------------------+----------+", - "| 0.9294097332465232 | 1 |", - "| 0.3114712539863804 | 1 |", + "| 0.9294097332465232 | 1.0 |", + "| 0.3114712539863804 | 1.0 |", "+--------------------+----------+", ]; @@ -344,7 +344,7 @@ async fn case_sensitive_identifiers_functions() { "+-----------+", "| sqrt(t.i) |", "+-----------+", - "| 1 |", + "| 1.0 |", "+-----------+", ]; @@ -431,7 +431,7 @@ async fn case_builtin_math_expression() { "+-----------+", "| sqrt(t.v) |", "+-----------+", - "| 1 |", + "| 1.0 |", "+-----------+", ]; let results = plan_and_collect(&ctx, "SELECT sqrt(v) FROM t") @@ -499,10 +499,10 @@ async fn test_power() -> Result<()> { "+-----------+-----------+-----------+-----------+------------------+--------------------+", "| power_i32 | power_i64 | power_f32 | power_f64 | power_int_scalar | power_float_scalar |", "+-----------+-----------+-----------+-----------+------------------+--------------------+", - "| 8 | 8 | 1 | 1 | 8 | 15.625 |", - "| 125 | 125 | 15.625 | 15.625 | 8 | 15.625 |", - "| 0 | 0 | 0 | 0 | 8 | 15.625 |", - "| -2744 | -2744 | -3048.625 | -3048.625 | 8 | 15.625 |", + "| 8 | 8.0 | 1.0 | 1.0 | 8 | 15.625 |", + "| 125 | 125.0 | 15.625 | 15.625 | 8 | 15.625 |", + "| 0 | 0.0 | 0.0 | 0.0 | 8 | 15.625 |", + "| -2744 | -2744.0 | -3048.625 | -3048.625 | 8 | 15.625 |", "| | | | | 8 | 15.625 |", "+-----------+-----------+-----------+-----------+------------------+--------------------+", ]; diff --git a/datafusion/core/tests/sql/group_by.rs b/datafusion/core/tests/sql/group_by.rs index 5fb106d440e8..a92eaf0f4d31 100644 --- a/datafusion/core/tests/sql/group_by.rs +++ b/datafusion/core/tests/sql/group_by.rs @@ -73,15 +73,15 @@ async fn csv_query_group_by_float64() -> Result<()> { let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-----+----------------+", - "| cnt | c2 |", - "+-----+----------------+", - "| 5 | 0.000000000005 |", - "| 4 | 0.000000000004 |", - "| 3 | 0.000000000003 |", - "| 2 | 0.000000000002 |", - "| 1 | 0.000000000001 |", - "+-----+----------------+", + "+-----+---------+", + "| cnt | c2 |", + "+-----+---------+", + "| 5 | 5.0e-12 |", + "| 4 | 4.0e-12 |", + "| 3 | 3.0e-12 |", + "| 2 | 2.0e-12 |", + "| 1 | 1.0e-12 |", + "+-----+---------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/core/tests/sql/intersection.rs b/datafusion/core/tests/sql/intersection.rs deleted file mode 100644 index 607048477bea..000000000000 --- a/datafusion/core/tests/sql/intersection.rs +++ /dev/null @@ -1,87 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::*; - -#[tokio::test] -async fn intersect_with_null_not_equal() { - let sql = "SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 - INTERSECT SELECT * FROM (SELECT null AS id1, 2 AS id2) t2"; - - let expected = vec!["++", "++"]; - let ctx = create_join_context_qualified("t1", "t2").unwrap(); - let actual = execute_to_batches(&ctx, sql).await; - assert_batches_eq!(expected, &actual); -} - -#[tokio::test] -async fn intersect_with_null_equal() { - let sql = "SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 - INTERSECT SELECT * FROM (SELECT null AS id1, 1 AS id2) t2"; - - let expected = vec![ - "+-----+-----+", - "| id1 | id2 |", - "+-----+-----+", - "| | 1 |", - "+-----+-----+", - ]; - - let ctx = create_join_context_qualified("t1", "t2").unwrap(); - let actual = execute_to_batches(&ctx, sql).await; - - assert_batches_eq!(expected, &actual); -} - -#[tokio::test] -async fn test_intersect_all() -> Result<()> { - let ctx = SessionContext::new(); - register_alltypes_parquet(&ctx).await; - // execute the query - let sql = "SELECT int_col, double_col FROM alltypes_plain where int_col > 0 INTERSECT ALL SELECT int_col, double_col FROM alltypes_plain LIMIT 4"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "+---------+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_intersect_distinct() -> Result<()> { - let ctx = SessionContext::new(); - register_alltypes_parquet(&ctx).await; - // execute the query - let sql = "SELECT int_col, double_col FROM alltypes_plain where int_col > 0 INTERSECT SELECT int_col, double_col FROM alltypes_plain"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 1 | 10.1 |", - "+---------+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 6d1b1e91b66e..5675a4bd6c5f 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1354,10 +1354,9 @@ async fn hash_join_with_date32() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.c1, t1.c2, t1.c3, t1.c4, t2.c1, t2.c2, t2.c3, t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " Inner Join: t1.c1 = t2.c1 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, 
c4:Dictionary(Int32, Utf8);N]", + " Inner Join: t1.c1 = t2.c1 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1367,12 +1366,12 @@ async fn hash_join_with_date32() -> Result<()> { ); let expected = vec![ - "+------------+------------+---------+-----+------------+------------+---------+-----+", - "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+---------+-----+------------+------------+---------+-----+", - "| 1970-01-02 | 1970-01-02 | 1.23 | abc | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "| 1970-01-04 | | -123.12 | jkl | 1970-01-04 | | 789.00 | |", - "+------------+------------+---------+-----+------------+------------+---------+-----+", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", + "| 1970-01-02 | 1970-01-02T00:00:00 | 1.23 | abc | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "| 1970-01-04 | | -123.12 | jkl | 1970-01-04 | | 789.00 | |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1392,10 +1391,9 @@ async fn hash_join_with_date64() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.c1, t1.c2, t1.c3, t1.c4, t2.c1, t2.c2, t2.c3, t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " Left Join: t1.c2 = t2.c2 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " Left Join: t1.c2 = t2.c2 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1405,14 +1403,14 @@ async fn hash_join_with_date64() -> Result<()> { ); let expected = vec![ - "+------------+------------+---------+-----+------------+------------+---------+--------+", - "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+---------+-----+------------+------------+---------+--------+", - "| | 1970-01-04 | 789.00 
| ghi | | 1970-01-04 | 0.00 | qwerty |", - "| 1970-01-02 | 1970-01-02 | 1.23 | abc | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "| 1970-01-03 | 1970-01-03 | 456.00 | def | | | | |", - "| 1970-01-04 | | -123.12 | jkl | | | | |", - "+------------+------------+---------+-----+------------+------------+---------+--------+", + "+------------+---------------------+---------+-----+------------+---------------------+---------+--------+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+--------+", + "| | 1970-01-04T00:00:00 | 789.00 | ghi | | 1970-01-04T00:00:00 | 0.00 | qwerty |", + "| 1970-01-02 | 1970-01-02T00:00:00 | 1.23 | abc | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "| 1970-01-03 | 1970-01-03T00:00:00 | 456.00 | def | | | | |", + "| 1970-01-04 | | -123.12 | jkl | | | | |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+--------+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1432,10 +1430,9 @@ async fn hash_join_with_decimal() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.c1, t1.c2, t1.c3, t1.c4, t2.c1, t2.c2, t2.c3, t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " Right Join: CAST(t1.c3 AS Decimal128(10, 2)) = t2.c3 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " Right Join: CAST(t1.c3 AS Decimal128(10, 2)) = t2.c3 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1445,14 +1442,14 @@ async fn hash_join_with_decimal() -> Result<()> { ); let expected = vec![ - "+------------+------------+---------+-----+------------+------------+-----------+---------+", - "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+---------+-----+------------+------------+-----------+---------+", - "| | | | | | | 100000.00 | abcdefg |", - "| | | | | | 1970-01-04 | 0.00 | qwerty |", - "| | 1970-01-04 | 789.00 | ghi | 1970-01-04 | | 789.00 | |", - "| 1970-01-04 | | -123.12 | jkl | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "+------------+------------+---------+-----+------------+------------+-----------+---------+", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", + "| | | | | | | 100000.00 | abcdefg |", + "| | | | | | 
1970-01-04T00:00:00 | 0.00 | qwerty |", + "| | 1970-01-04T00:00:00 | 789.00 | ghi | 1970-01-04 | | 789.00 | |", + "| 1970-01-04 | | -123.12 | jkl | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1472,10 +1469,9 @@ async fn hash_join_with_dictionary() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.c1, t1.c2, t1.c3, t1.c4, t2.c1, t2.c2, t2.c3, t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " Inner Join: t1.c4 = t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", - " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " Inner Join: t1.c4 = t2.c4 [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N, c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t1 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(5, 2);N, c4:Dictionary(Int32, Utf8);N]", + " TableScan: t2 projection=[c1, c2, c3, c4] [c1:Date32;N, c2:Date64;N, c3:Decimal128(10, 2);N, c4:Dictionary(Int32, Utf8);N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1485,11 +1481,11 @@ async fn hash_join_with_dictionary() -> Result<()> { ); let expected = vec![ - "+------------+------------+------+-----+------------+------------+---------+-----+", - "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+------+-----+------------+------------+---------+-----+", - "| 1970-01-02 | 1970-01-02 | 1.23 | abc | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "+------------+------------+------+-----+------------+------------+---------+-----+", + "+------------+---------------------+------+-----+------------+---------------------+---------+-----+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+------+-----+------------+---------------------+---------+-----+", + "| 1970-01-02 | 1970-01-02T00:00:00 | 1.23 | abc | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "+------------+---------------------+------+-----+------------+---------------------+---------+-----+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1512,12 +1508,11 @@ async fn reduce_left_join_1() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t2.t2_id < UInt32(100) 
[t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t2.t2_id < UInt32(100) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1560,12 +1555,11 @@ async fn reduce_left_join_2() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t2.t2_int < UInt32(10) OR t1.t1_int > UInt32(2) AND t2.t2_name != Utf8(\"w\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t2.t2_int < UInt32(10) OR t2.t2_name != Utf8(\"w\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t2.t2_int < UInt32(10) OR t1.t1_int > UInt32(2) AND t2.t2_name != Utf8(\"w\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t2.t2_int < UInt32(10) OR t2.t2_name != Utf8(\"w\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1603,16 +1597,15 @@ async fn reduce_left_join_3() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t3.t1_id, t3.t1_name, t3.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Left Join: t3.t1_int = t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " SubqueryAlias: t3 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_int:UInt32;N]", - " Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t2.t2_int < 
UInt32(3) AND t2.t2_id < UInt32(100) [t2_id:UInt32;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Left Join: t3.t1_int = t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " SubqueryAlias: t3 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_int:UInt32;N]", + " Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t2.t2_int < UInt32(3) AND t2.t2_id < UInt32(100) [t2_id:UInt32;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1648,11 +1641,10 @@ async fn reduce_right_join_1() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t1.t1_int IS NOT NULL [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t1.t1_int IS NOT NULL [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1690,11 +1682,10 @@ async fn reduce_right_join_2() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t1.t1_int != t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t1.t1_int != t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, 
t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1732,11 +1723,10 @@ async fn reduce_full_join_to_right_join() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Right Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t2.t2_name IS NOT NULL [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Right Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t2.t2_name IS NOT NULL [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1776,11 +1766,10 @@ async fn reduce_full_join_to_left_join() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Left Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t1.t1_name != Utf8(\"b\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Left Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t1.t1_name != Utf8(\"b\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1817,12 +1806,11 @@ async fn reduce_full_join_to_inner_join() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, 
t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t1.t1_name != Utf8(\"b\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t2.t2_name = Utf8(\"x\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Filter: t1.t1_name != Utf8(\"b\") [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t2.t2_name = Utf8(\"x\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1879,18 +1867,17 @@ async fn sort_merge_join_on_date32() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = vec![ - "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@4 as c1, c2@5 as c2, c3@6 as c3, c4@7 as c4]", - " SortMergeJoin: join_type=Inner, on=[(Column { name: \"c1\", index: 0 }, Column { name: \"c1\", index: 0 })]", - " SortExec: expr=[c1@0 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: expr=[c1@0 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + "SortMergeJoin: join_type=Inner, on=[(Column { name: \"c1\", index: 0 }, Column { name: \"c1\", index: 0 })]", + " SortExec: expr=[c1@0 ASC]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " SortExec: expr=[c1@0 ASC]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ]; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1900,12 +1887,12 @@ async fn sort_merge_join_on_date32() -> Result<()> { ); let expected = vec![ - "+------------+------------+---------+-----+------------+------------+---------+-----+", - "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+---------+-----+------------+------------+---------+-----+", - "| 1970-01-02 | 1970-01-02 
| 1.23 | abc | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "| 1970-01-04 | | -123.12 | jkl | 1970-01-04 | | 789.00 | |", - "+------------+------------+---------+-----+------------+------------+---------+-----+", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", + "| 1970-01-02 | 1970-01-02T00:00:00 | 1.23 | abc | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "| 1970-01-04 | | -123.12 | jkl | 1970-01-04 | | 789.00 | |", + "+------------+---------------------+---------+-----+------------+---------------------+---------+-----+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1924,20 +1911,19 @@ async fn sort_merge_join_on_decimal() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = vec![ - "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@4 as c1, c2@5 as c2, c3@6 as c3, c4@7 as c4]", - " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4]", - " SortMergeJoin: join_type=Right, on=[(Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }, Column { name: \"c3\", index: 2 })]", - " SortExec: expr=[CAST(t1.c3 AS Decimal128(10, 2))@4 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }], 2), input_partitions=2", - " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: expr=[c3@2 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c3\", index: 2 }], 2), input_partitions=2", + "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4]", + " SortMergeJoin: join_type=Right, on=[(Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }, Column { name: \"c3\", index: 2 })]", + " SortExec: expr=[CAST(t1.c3 AS Decimal128(10, 2))@4 ASC]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }], 2), input_partitions=2", + " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", + " SortExec: expr=[c3@2 ASC]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c3\", index: 2 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ]; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -1947,14 +1933,14 @@ async fn sort_merge_join_on_decimal() -> Result<()> { ); let expected = vec![ - "+------------+------------+---------+-----+------------+------------+-----------+---------+", - "| c1 | c2 | c3 | 
c4 | c1 | c2 | c3 | c4 |", - "+------------+------------+---------+-----+------------+------------+-----------+---------+", - "| | | | | | | 100000.00 | abcdefg |", - "| | | | | | 1970-01-04 | 0.00 | qwerty |", - "| | 1970-01-04 | 789.00 | ghi | 1970-01-04 | | 789.00 | |", - "| 1970-01-04 | | -123.12 | jkl | 1970-01-02 | 1970-01-02 | -123.12 | abc |", - "+------------+------------+---------+-----+------------+------------+-----------+---------+", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", + "| c1 | c2 | c3 | c4 | c1 | c2 | c3 | c4 |", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", + "| | | | | | | 100000.00 | abcdefg |", + "| | | | | | 1970-01-04T00:00:00 | 0.00 | qwerty |", + "| | 1970-01-04T00:00:00 | 789.00 | ghi | 1970-01-04 | | 789.00 | |", + "| 1970-01-04 | | -123.12 | jkl | 1970-01-02 | 1970-01-02T00:00:00 | -123.12 | abc |", + "+------------+---------------------+---------+-----+------------+---------------------+-----------+---------+", ]; let results = execute_to_batches(&ctx, sql).await; @@ -1982,28 +1968,26 @@ async fn left_semi_join() -> Result<()> { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", " SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " ProjectionExec: expr=[t2_id@0 as t2_id]", " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " ProjectionExec: expr=[t2_id@0 as t2_id]", - " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " MemoryExec: partitions=1, partition_sizes=[1]", + " ProjectionExec: expr=[t2_id@0 as t2_id]", " MemoryExec: 
partitions=1, partition_sizes=[1]", - " ProjectionExec: expr=[t2_id@0 as t2_id]", - " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2062,26 +2046,24 @@ async fn left_semi_join() -> Result<()> { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", " SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2120,11 +2102,10 @@ async fn left_semi_join_pushdown() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name [t1_id:UInt32;N, t1_name:Utf8;N]", - " LeftSemi Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", - " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", - " Filter: t2.t2_int > UInt32(1) [t2_id:UInt32;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + " LeftSemi Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " Filter: t2.t2_int > UInt32(1) [t2_id:UInt32;N, t2_int:UInt32;N]", + " TableScan: t2 
projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -2255,29 +2236,25 @@ async fn right_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name]", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", - " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name]", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ 
-2302,28 +2279,26 @@ async fn right_semi_join() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { - vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2385,10 +2360,9 @@ async fn reduce_cross_join_with_expr_join_key_all() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: 
t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Inner Join: CAST(t1.t1_id AS Int64) + Int64(12) = CAST(t2.t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Inner Join: CAST(t1.t1_id AS Int64) + Int64(12) = CAST(t2.t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2769,29 +2743,27 @@ async fn select_wildcard_with_expr_key_inner_join() -> Result<()> { let expected = if repartition_joins { vec![ "ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2.t2_id - UInt32(11)\", index: 3 })]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2.t2_id - UInt32(11)\", index: 3 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2.t2_id - UInt32(11)\", index: 3 }], 2), input_partitions=2", + " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as t2.t2_id - UInt32(11)]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2.t2_id - UInt32(11)\", index: 3 }], 2), input_partitions=2", - " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as t2.t2_id - UInt32(11)]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - ] + ] } else { vec![ "ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int]", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, 
join_type=Inner, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2.t2_id - UInt32(11)\", index: 3 })]", - " MemoryExec: partitions=1, partition_sizes=[1]", - " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as t2.t2_id - UInt32(11)]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2.t2_id - UInt32(11)\", index: 3 })]", + " MemoryExec: partitions=1, partition_sizes=[1]", + " ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as t2.t2_id - UInt32(11)]", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2831,10 +2803,9 @@ async fn join_with_type_coercion_for_equi_expr() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " Inner Join: CAST(t1.t1_id AS Int64) + Int64(11) = CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Inner Join: CAST(t1.t1_id AS Int64) + Int64(11) = CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2873,10 +2844,9 @@ async fn join_only_with_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " Inner Join: Filter: CAST(t1.t1_id AS Int64) * Int64(4) < CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Inner Join: Filter: CAST(t1.t1_id AS Int64) * Int64(4) < CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2916,10 +2886,9 @@ async fn type_coercion_join_with_filter_and_equi_expr() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " Inner Join: CAST(t1.t1_id AS Int64) * Int64(5) = CAST(t2.t2_id AS Int64) Filter: CAST(t1.t1_id AS Int64) * Int64(4) < CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Inner Join: CAST(t1.t1_id AS Int64) * Int64(5) = CAST(t2.t2_id AS Int64) Filter: CAST(t1.t1_id AS Int64) * Int64(4) < CAST(t2.t2_id AS Int64) [t1_id:UInt32;N, t1_name:Utf8;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + 
" TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2980,18 +2949,18 @@ async fn test_cross_join_to_groupby_with_different_key_ordering() -> Result<()> "+------+------+----------+", "| col1 | col2 | sum_col3 |", "+------+------+----------+", - "| A | 1 | 2 |", - "| A | 2 | 2 |", - "| A | 3 | 2 |", - "| A | 4 | 2 |", - "| A | 5 | 0 |", - "| A | 6 | 0 |", - "| BB | 1 | 0 |", - "| BB | 2 | 0 |", - "| BB | 3 | 0 |", - "| BB | 4 | 0 |", - "| BB | 5 | 2 |", - "| BB | 6 | 2 |", + "| A | 1 | 2.0 |", + "| A | 2 | 2.0 |", + "| A | 3 | 2.0 |", + "| A | 4 | 2.0 |", + "| A | 5 | 0.0 |", + "| A | 6 | 0.0 |", + "| BB | 1 | 0.0 |", + "| BB | 2 | 0.0 |", + "| BB | 3 | 0.0 |", + "| BB | 4 | 0.0 |", + "| BB | 5 | 2.0 |", + "| BB | 6 | 2.0 |", "+------+------+----------+", ]; @@ -3014,12 +2983,11 @@ async fn subquery_to_join_with_both_side_expr() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3049,7 +3017,7 @@ async fn subquery_to_join_with_both_side_expr() -> Result<()> { async fn subquery_to_join_with_muti_filter() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in (select t2.t2_id + 1 from t2 where t1.t1_int <= t2.t2_int and t2.t2_int > 0)"; // assert logical plan @@ -3059,13 +3027,12 @@ async fn subquery_to_join_with_muti_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", - " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, 
t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", + " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3094,7 +3061,7 @@ async fn subquery_to_join_with_muti_filter() -> Result<()> { async fn three_projection_exprs_subquery_to_join() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in (select t2.t2_id + 1 from t2 where t1.t1_int <= t2.t2_int and t1.t1_name != t2.t2_name and t2.t2_int > 0)"; // assert logical plan @@ -3104,13 +3071,12 @@ async fn three_projection_exprs_subquery_to_join() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3139,7 +3105,7 @@ async fn three_projection_exprs_subquery_to_join() -> Result<()> { async fn in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { 
let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in (select t2.t2_id + 1 from t2 where t1.t1_int > 0)"; // assert logical plan @@ -3148,13 +3114,12 @@ async fn in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { let plan = dataframe.into_optimized_plan().unwrap(); let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3170,7 +3135,7 @@ async fn in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { async fn not_in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 not in + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 not in (select t2.t2_id + 1 from t2 where t1.t1_int > 0)"; // assert logical plan @@ -3179,12 +3144,11 @@ async fn not_in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { let plan = dataframe.into_optimized_plan().unwrap(); let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftAnti Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftAnti Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, 
t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3200,7 +3164,7 @@ async fn not_in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { async fn in_subquery_to_join_with_outer_filter() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in (select t2.t2_id + 1 from t2 where t1.t1_int <= t2.t2_int and t1.t1_name != t2.t2_name) and t1.t1_id > 0"; // assert logical plan @@ -3210,13 +3174,12 @@ async fn in_subquery_to_join_with_outer_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3245,8 +3208,8 @@ async fn in_subquery_to_join_with_outer_filter() -> Result<()> { async fn two_in_subquery_to_join_with_outer_filter() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", false)?; - let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in - (select t2.t2_id + 1 from t2) + let sql = "select t1.t1_id, t1.t1_name, t1.t1_int from t1 where t1.t1_id + 12 in + (select t2.t2_id + 1 from t2) and t1.t1_int in(select t2.t2_int + 1 from t2) and t1.t1_id > 0"; @@ -3257,17 +3220,16 @@ async fn two_in_subquery_to_join_with_outer_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: 
t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_int AS Int64) = __correlated_sq_2.CAST(t2_int AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", - " SubqueryAlias: __correlated_sq_2 [CAST(t2_int AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_int AS Int64) + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) [CAST(t2_int AS Int64) + Int64(1):Int64;N]", - " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_int AS Int64) = __correlated_sq_2.CAST(t2_int AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " SubqueryAlias: __correlated_sq_2 [CAST(t2_int AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_int AS Int64) + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) [CAST(t2_int AS Int64) + Int64(1):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -3296,8 +3258,8 @@ async fn right_as_inner_table_nested_loop_join() -> Result<()> { let ctx = create_nested_loop_join_context()?; // Distribution: left is `UnspecifiedDistribution`, right is `SinglePartition`. - let sql = "SELECT t1.t1_id, t2.t2_id - FROM t1 INNER JOIN t2 ON t1.t1_id > t2.t2_id + let sql = "SELECT t1.t1_id, t2.t2_id + FROM t1 INNER JOIN t2 ON t1.t1_id > t2.t2_id WHERE t1.t1_id > 10 AND t2.t2_int > 1"; let msg = format!("Creating logical plan for '{sql}'"); @@ -3346,8 +3308,8 @@ async fn left_as_inner_table_nested_loop_join() -> Result<()> { let ctx = create_nested_loop_join_context()?; // Distribution: left is `SinglePartition`, right is `UnspecifiedDistribution`. - let sql = "SELECT t1.t1_id,t2.t2_id FROM (select t1_id from t1 where t1.t1_id > 22) as t1 - RIGHT JOIN (select t2_id from t2 where t2.t2_id > 11) as t2 + let sql = "SELECT t1.t1_id,t2.t2_id FROM (select t1_id from t1 where t1.t1_id > 22) as t1 + RIGHT JOIN (select t2_id from t2 where t2.t2_id > 11) as t2 ON t1.t1_id < t2.t2_id"; let msg = format!("Creating logical plan for '{sql}'"); @@ -3356,19 +3318,16 @@ async fn left_as_inner_table_nested_loop_join() -> Result<()> { // left is single partition side, so it will be visited many times. 
let expected = vec![ - "ProjectionExec: expr=[t1_id@0 as t1_id, t2_id@1 as t2_id]", - " NestedLoopJoinExec: join_type=Right, filter=BinaryExpr { left: Column { name: \"t1_id\", index: 0 }, op: Lt, right: Column { name: \"t2_id\", index: 1 } }", - " CoalescePartitionsExec", - " ProjectionExec: expr=[t1_id@0 as t1_id]", - " CoalesceBatchesExec: target_batch_size=4096", - " FilterExec: t1_id@0 > 22", - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " ProjectionExec: expr=[t2_id@0 as t2_id]", - " CoalesceBatchesExec: target_batch_size=4096", - " FilterExec: t2_id@0 > 11", - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + "NestedLoopJoinExec: join_type=Right, filter=BinaryExpr { left: Column { name: \"t1_id\", index: 0 }, op: Lt, right: Column { name: \"t2_id\", index: 1 } }", + " CoalescePartitionsExec", + " CoalesceBatchesExec: target_batch_size=4096", + " FilterExec: t1_id@0 > 22", + " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " FilterExec: t2_id@0 > 11", + " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ]; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -3409,11 +3368,9 @@ async fn exists_subquery_to_join_expr_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Projection: t2.t2_id [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -3453,12 +3410,11 @@ async fn exists_subquery_to_join_inner_filter() -> Result<()> { // `t2.t2_int < 3` will be kept in the subquery filter. 
let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Projection: t2.t2_id [t2_id:UInt32;N]", - " Filter: t2.t2_int < UInt32(3) [t2_id:UInt32;N, t2_int:UInt32;N]", - " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Projection: t2.t2_id [t2_id:UInt32;N]", + " Filter: t2.t2_int < UInt32(3) [t2_id:UInt32;N, t2_int:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -3496,12 +3452,10 @@ async fn exists_subquery_to_join_outer_filter() -> Result<()> { // `t1.t1_int < 3` will be moved to the filter of t1. let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Filter: t1.t1_int < UInt32(3) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Projection: t2.t2_id [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftSemi Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Filter: t1.t1_int < UInt32(3) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -3538,11 +3492,9 @@ async fn not_exists_subquery_to_join_expr_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: t1.t1_id, t1.t1_name, t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftAnti Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " Projection: t2.t2_id [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " LeftAnti Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(t2.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); diff --git a/datafusion/core/tests/sql/json.rs 
b/datafusion/core/tests/sql/json.rs index 26ddff61d0a7..965a9c14fc98 100644 --- a/datafusion/core/tests/sql/json.rs +++ b/datafusion/core/tests/sql/json.rs @@ -33,16 +33,16 @@ async fn json_query() { "+-----------------+------+", "| a | b |", "+-----------------+------+", - "| 1 | 2 |", + "| 1 | 2.0 |", "| -10 | -3.5 |", "| 2 | 0.6 |", - "| 1 | 2 |", + "| 1 | 2.0 |", "| 7 | -3.5 |", "| 1 | 0.6 |", - "| 1 | 2 |", + "| 1 | 2.0 |", "| 5 | -3.5 |", "| 1 | 0.6 |", - "| 1 | 2 |", + "| 1 | 2.0 |", "| 1 | -3.5 |", "| 100000000000000 | 0.6 |", "+-----------------+------+", @@ -83,18 +83,16 @@ async fn json_explain() { let expected = vec![ vec![ "logical_plan", - "Projection: COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n TableScan: t1 projection=[a]", + "Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: t1 projection=[a]", ], vec![ "physical_plan", - "ProjectionExec: expr=[COUNT(UInt8(1))@0 as COUNT(UInt8(1))]\ - \n AggregateExec: mode=Final, gby=[], aggr=[COUNT(UInt8(1))]\ - \n CoalescePartitionsExec\ - \n AggregateExec: mode=Partial, gby=[], aggr=[COUNT(UInt8(1))]\ - \n RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1\ - \n JsonExec: limit=None, files={1 group: [[WORKING_DIR/tests/jsons/2.json]]}\n", + "AggregateExec: mode=Final, gby=[], aggr=[COUNT(UInt8(1))]\ + \n CoalescePartitionsExec\ + \n AggregateExec: mode=Partial, gby=[], aggr=[COUNT(UInt8(1))]\ + \n RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1\ + \n JsonExec: limit=None, files={1 group: [[WORKING_DIR/tests/jsons/2.json]]}\n", ], ]; assert_eq!(expected, actual); diff --git a/datafusion/core/tests/sql/limit.rs b/datafusion/core/tests/sql/limit.rs index 261e7f819020..a4247492b415 100644 --- a/datafusion/core/tests/sql/limit.rs +++ b/datafusion/core/tests/sql/limit.rs @@ -17,87 +17,6 @@ use super::*; -#[tokio::test] -async fn csv_query_limit() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 LIMIT 2"; - let actual = execute_to_batches(&ctx, sql).await; - #[rustfmt::skip] - let expected = vec![ - "+----+", - "| c1 |", - "+----+", - "| c |", - "| d |", - "+----+" - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_limit_bigger_than_nbr_of_rows() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c2 FROM aggregate_test_100 LIMIT 200"; - let actual = execute_to_batches(&ctx, sql).await; - // println!("{}", pretty_format_batches(&a).unwrap()); - let expected = vec![ - "+----+", "| c2 |", "+----+", "| 2 |", "| 5 |", "| 1 |", "| 1 |", "| 5 |", - "| 4 |", "| 3 |", "| 3 |", "| 1 |", "| 4 |", "| 1 |", "| 4 |", "| 3 |", - "| 2 |", "| 1 |", "| 1 |", "| 2 |", "| 1 |", "| 3 |", "| 2 |", "| 4 |", - "| 1 |", "| 5 |", "| 4 |", "| 2 |", "| 1 |", "| 4 |", "| 5 |", "| 2 |", - "| 3 |", "| 4 |", "| 2 |", "| 1 |", "| 5 |", "| 3 |", "| 1 |", "| 2 |", - "| 3 |", "| 3 |", "| 3 |", "| 2 |", "| 4 |", "| 1 |", "| 3 |", "| 2 |", - "| 5 |", "| 2 |", "| 1 |", "| 4 |", "| 1 |", "| 4 |", "| 2 |", "| 5 |", - "| 4 |", "| 2 |", "| 3 |", "| 4 |", "| 4 |", "| 4 |", "| 5 |", "| 4 |", - "| 2 |", "| 1 |", "| 2 |", "| 4 |", "| 2 |", "| 3 |", "| 5 |", "| 1 |", - "| 1 |", "| 4 |", "| 2 |", "| 1 |", "| 2 |", "| 1 |", "| 1 |", "| 5 |", - "| 4 |", "| 5 |", "| 2 |", "| 3 |", "| 2 |", "| 4 |", "| 1 |", "| 3 |", - "| 4 |", "| 3 |", "| 2 |", "| 5 |", "| 3 |", "| 3 |", "| 2 |", "| 5 |", - "| 5 |", 
"| 4 |", "| 1 |", "| 3 |", "| 3 |", "| 4 |", "| 4 |", "+----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_limit_with_same_nbr_of_rows() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c2 FROM aggregate_test_100 LIMIT 100"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----+", "| c2 |", "+----+", "| 2 |", "| 5 |", "| 1 |", "| 1 |", "| 5 |", - "| 4 |", "| 3 |", "| 3 |", "| 1 |", "| 4 |", "| 1 |", "| 4 |", "| 3 |", - "| 2 |", "| 1 |", "| 1 |", "| 2 |", "| 1 |", "| 3 |", "| 2 |", "| 4 |", - "| 1 |", "| 5 |", "| 4 |", "| 2 |", "| 1 |", "| 4 |", "| 5 |", "| 2 |", - "| 3 |", "| 4 |", "| 2 |", "| 1 |", "| 5 |", "| 3 |", "| 1 |", "| 2 |", - "| 3 |", "| 3 |", "| 3 |", "| 2 |", "| 4 |", "| 1 |", "| 3 |", "| 2 |", - "| 5 |", "| 2 |", "| 1 |", "| 4 |", "| 1 |", "| 4 |", "| 2 |", "| 5 |", - "| 4 |", "| 2 |", "| 3 |", "| 4 |", "| 4 |", "| 4 |", "| 5 |", "| 4 |", - "| 2 |", "| 1 |", "| 2 |", "| 4 |", "| 2 |", "| 3 |", "| 5 |", "| 1 |", - "| 1 |", "| 4 |", "| 2 |", "| 1 |", "| 2 |", "| 1 |", "| 1 |", "| 5 |", - "| 4 |", "| 5 |", "| 2 |", "| 3 |", "| 2 |", "| 4 |", "| 1 |", "| 3 |", - "| 4 |", "| 3 |", "| 2 |", "| 5 |", "| 3 |", "| 3 |", "| 2 |", "| 5 |", - "| 5 |", "| 4 |", "| 1 |", "| 3 |", "| 3 |", "| 4 |", "| 4 |", "+----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_limit_zero() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 LIMIT 0"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - #[tokio::test] async fn limit() -> Result<()> { let tmp_dir = TempDir::new()?; @@ -184,84 +103,3 @@ async fn limit_multi_partitions() -> Result<()> { Ok(()) } - -#[tokio::test] -async fn csv_offset_without_limit_99() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 OFFSET 99"; - let actual = execute_to_batches(&ctx, sql).await; - - #[rustfmt::skip] - let expected = vec![ - "+----+", - "| c1 |", - "+----+", - "| e |", - "+----+"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_offset_without_limit_100() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 OFFSET 100"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_offset_without_limit_101() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 OFFSET 101"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_offset() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 OFFSET 2 LIMIT 2"; - let actual = execute_to_batches(&ctx, sql).await; - - #[rustfmt::skip] - let expected = vec![ - "+----+", - "| c1 |", - "+----+", - "| b |", - "| a |", - "+----+"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_offset_the_same_as_nbr_of_rows() -> 
Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 LIMIT 1 OFFSET 100"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_offset_bigger_than_nbr_of_rows() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1 FROM aggregate_test_100 LIMIT 1 OFFSET 101"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 4b3c60d7e01f..d8b6a83f2f1a 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -28,7 +28,7 @@ use chrono::Duration; use datafusion::config::ConfigOptions; use datafusion::datasource::TableProvider; use datafusion::from_slice::FromSlice; -use datafusion::logical_expr::{Aggregate, LogicalPlan, Projection, TableScan}; +use datafusion::logical_expr::{Aggregate, LogicalPlan, TableScan}; use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::ExecutionPlanVisitor; @@ -87,7 +87,6 @@ pub mod explain_analyze; pub mod expr; pub mod functions; pub mod group_by; -pub mod intersection; pub mod joins; pub mod json; pub mod limit; @@ -1589,18 +1588,15 @@ async fn nyc() -> Result<()> { let optimized_plan = dataframe.into_optimized_plan().unwrap(); match &optimized_plan { - LogicalPlan::Projection(Projection { input, .. }) => match input.as_ref() { - LogicalPlan::Aggregate(Aggregate { input, .. }) => match input.as_ref() { - LogicalPlan::TableScan(TableScan { - ref projected_schema, - .. - }) => { - assert_eq!(2, projected_schema.fields().len()); - assert_eq!(projected_schema.field(0).name(), "passenger_count"); - assert_eq!(projected_schema.field(1).name(), "fare_amount"); - } - _ => unreachable!(), - }, + LogicalPlan::Aggregate(Aggregate { input, .. }) => match input.as_ref() { + LogicalPlan::TableScan(TableScan { + ref projected_schema, + .. 
+ }) => { + assert_eq!(2, projected_schema.fields().len()); + assert_eq!(projected_schema.field(0).name(), "passenger_count"); + assert_eq!(projected_schema.field(1).name(), "fare_amount"); + } _ => unreachable!(), }, _ => unreachable!(), diff --git a/datafusion/core/tests/sql/parquet.rs b/datafusion/core/tests/sql/parquet.rs index b01e8b843d52..31cd0da21b93 100644 --- a/datafusion/core/tests/sql/parquet.rs +++ b/datafusion/core/tests/sql/parquet.rs @@ -309,11 +309,11 @@ async fn parquet_query_with_max_min() { let sql = "SELECT max(c1) FROM foo"; let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-------------+", - "| MAX(foo.c1) |", - "+-------------+", - "| 3 |", - "+-------------+", + "+---------+", + "| MAX(c1) |", + "+---------+", + "| 3 |", + "+---------+", ]; assert_batches_eq!(expected, &actual); @@ -333,11 +333,11 @@ async fn parquet_query_with_max_min() { let sql = "SELECT max(c3) FROM foo"; let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-------------+", - "| MAX(foo.c3) |", - "+-------------+", - "| 300 |", - "+-------------+", + "+---------+", + "| MAX(c3) |", + "+---------+", + "| 300 |", + "+---------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/core/tests/sql/predicates.rs b/datafusion/core/tests/sql/predicates.rs index 1e8888ce45f9..ac57e0a1f224 100644 --- a/datafusion/core/tests/sql/predicates.rs +++ b/datafusion/core/tests/sql/predicates.rs @@ -17,286 +17,6 @@ use super::*; -#[tokio::test] -async fn csv_query_with_predicate() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1, c12 FROM aggregate_test_100 WHERE c12 > 0.376 AND c12 < 0.4"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----+---------------------+", - "| c1 | c12 |", - "+----+---------------------+", - "| e | 0.39144436569161134 |", - "| d | 0.38870280983958583 |", - "+----+---------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_with_negative_predicate() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c1, c4 FROM aggregate_test_100 WHERE c3 < -55 AND -c4 > 30000"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----+--------+", - "| c1 | c4 |", - "+----+--------+", - "| e | -31500 |", - "| c | -30187 |", - "+----+--------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_with_negated_predicate() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE NOT(c1 != 'a')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------------+", - "| COUNT(Int64(1)) |", - "+-----------------+", - "| 21 |", - "+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_with_is_not_null_predicate() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NOT NULL"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------------+", - "| COUNT(Int64(1)) |", - "+-----------------+", - "| 100 |", - "+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_query_with_is_null_predicate() -> 
Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NULL"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------------+", - "| COUNT(Int64(1)) |", - "+-----------------+", - "| 0 |", - "+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn query_where_neg_num() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv_by_sql(&ctx).await; - - // Negative numbers do not parse correctly as of Arrow 2.0.0 - let sql = "select c7, c8 from aggregate_test_100 where c7 >= -2 and c7 < 10"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----+-------+", - "| c7 | c8 |", - "+----+-------+", - "| 7 | 45465 |", - "| 5 | 40622 |", - "| 0 | 61069 |", - "| 2 | 20120 |", - "| 4 | 39363 |", - "+----+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // Also check floating point neg numbers - let sql = "select c7, c8 from aggregate_test_100 where c7 >= -2.9 and c7 < 10"; - let actual = execute_to_batches(&ctx, sql).await; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn like() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv_by_sql(&ctx).await; - let sql = "SELECT COUNT(c1) FROM aggregate_test_100 WHERE c13 LIKE '%FB%'"; - // check that the physical and logical schemas are equal - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------------------------------+", - "| COUNT(aggregate_test_100.c1) |", - "+------------------------------+", - "| 1 |", - "+------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_between_expr() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c4 FROM aggregate_test_100 WHERE c12 BETWEEN 0.995 AND 1.0"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c4 |", - "+-------+", - "| 10837 |", - "+-------+", - ]; - assert_batches_sorted_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_between_expr_negated() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT c4 FROM aggregate_test_100 WHERE c12 NOT BETWEEN 0 AND 0.995"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c4 |", - "+-------+", - "| 10837 |", - "+-------+", - ]; - assert_batches_sorted_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn like_on_strings() -> Result<()> { - let input = vec![Some("foo"), Some("bar"), None, Some("fazzz")] - .into_iter() - .collect::<StringArray>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 LIKE '%a%'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| bar |", - "| fazzz |", - "+-------+", - ]; - - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn like_on_string_dictionaries() -> Result<()> { - let input = vec![Some("foo"), Some("bar"), None, Some("fazzz")] - .into_iter() - .collect::<DictionaryArray<Int32Type>>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - 
ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 LIKE '%a%'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| bar |", - "| fazzz |", - "+-------+", - ]; - - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_regexp_is_match() -> Result<()> { - let input = vec![Some("foo"), Some("Barrr"), Some("Bazzz"), Some("ZZZZZ")] - .into_iter() - .collect::<StringArray>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 ~ 'z'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| Bazzz |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 ~* 'z'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| Bazzz |", - "| ZZZZZ |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 !~ 'z'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| foo |", - "| Barrr |", - "| ZZZZZ |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 !~* 'z'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| foo |", - "| Barrr |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - #[tokio::test] async fn string_coercion() -> Result<()> { let vendor_id_utf8: StringArray = @@ -340,214 +60,6 @@ async fn string_coercion() -> Result<()> { Ok(()) } -#[tokio::test] -async fn except_with_null_not_equal() { - let sql = "SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 - EXCEPT SELECT * FROM (SELECT null AS id1, 2 AS id2) t2"; - - let expected = vec![ - "+-----+-----+", - "| id1 | id2 |", - "+-----+-----+", - "| | 1 |", - "+-----+-----+", - ]; - - let ctx = create_join_context_qualified("t1", "t2").unwrap(); - let actual = execute_to_batches(&ctx, sql).await; - - assert_batches_eq!(expected, &actual); -} - -#[tokio::test] -async fn except_with_null_equal() { - let sql = "SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 - EXCEPT SELECT * FROM (SELECT null AS id1, 1 AS id2) t2"; - - let expected = vec!["++", "++"]; - let ctx = create_join_context_qualified("t1", "t2").unwrap(); - let actual = execute_to_batches(&ctx, sql).await; - - assert_batches_eq!(expected, &actual); -} - -#[tokio::test] -async fn test_expect_all() -> Result<()> { - let ctx = SessionContext::new(); - register_alltypes_parquet(&ctx).await; - // execute the query - let sql = "SELECT int_col, double_col FROM alltypes_plain where int_col > 0 EXCEPT ALL SELECT int_col, double_col FROM alltypes_plain where int_col < 1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "| 1 | 10.1 |", - "+---------+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_expect_distinct() -> Result<()> { - let ctx = SessionContext::new(); - register_alltypes_parquet(&ctx).await; - // execute the query - let sql = "SELECT int_col, double_col FROM 
alltypes_plain where int_col > 0 EXCEPT SELECT int_col, double_col FROM alltypes_plain where int_col < 1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 1 | 10.1 |", - "+---------+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn csv_in_set_test() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT count(*) FROM aggregate_test_100 WHERE c7 in ('25','155','204','77','208','67','139','191','26','7','202','113','129','197','249','146','129','220','154','163','220','19','71','243','150','231','196','170','99','255');"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------------+", - "| COUNT(UInt8(1)) |", - "+-----------------+", - "| 36 |", - "+-----------------+", - ]; - assert_batches_sorted_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn in_list_string_dictionaries() -> Result<()> { - // let input = vec![Some("foo"), Some("bar"), None, Some("fazzz")] - let input = vec![Some("foo"), Some("bar"), Some("fazzz")] - .into_iter() - .collect::>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 IN ('Bar')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["+-----+", "| c1 |", "+-----+", "| foo |", "+-----+"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('bar', 'foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+", "| c1 |", "+-----+", "| foo |", "| bar |", "+-----+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('Bar', 'foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["+-----+", "| c1 |", "+-----+", "| foo |", "+-----+"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('foo', 'Bar', 'fazzz')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| foo |", - "| fazzz |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn in_list_string_dictionaries_with_null() -> Result<()> { - let input = vec![Some("foo"), Some("bar"), None, Some("fazzz")] - .into_iter() - .collect::>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 IN ('Bar')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["++", "++"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["+-----+", "| c1 |", "+-----+", "| foo |", "+-----+"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('bar', 'foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+", "| c1 |", "+-----+", "| foo |", "| bar |", "+-----+", - 
]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('Bar', 'foo')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec!["+-----+", "| c1 |", "+-----+", "| foo |", "+-----+"]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT * FROM test WHERE c1 IN ('foo', 'Bar', 'fazzz')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| foo |", - "| fazzz |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn in_set_string_dictionaries() -> Result<()> { - let input = vec![Some("foo"), Some("bar"), None, Some("fazzz")] - .into_iter() - .collect::>(); - - let batch = RecordBatch::try_from_iter(vec![("c1", Arc::new(input) as _)]).unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - let sql = "SELECT * FROM test WHERE c1 IN ('foo', 'Bar', 'fazzz')"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------+", - "| c1 |", - "+-------+", - "| foo |", - "| fazzz |", - "+-------+", - ]; - - assert_batches_eq!(expected, &actual); - Ok(()) -} - #[tokio::test] // Test issue: https://github.com/apache/arrow-datafusion/issues/3635 async fn multiple_or_predicates() -> Result<()> { @@ -644,15 +156,14 @@ where let plan = dataframe.into_optimized_plan().unwrap(); let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected =vec![ - "Projection: part.p_partkey, SUM(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(DISTINCT partsupp.ps_suppkey) [p_partkey:Int64, SUM(lineitem.l_extendedprice):Decimal128(25, 2);N, AVG(lineitem.l_discount):Decimal128(19, 6);N, COUNT(DISTINCT partsupp.ps_suppkey):Int64;N]", - " Aggregate: groupBy=[[part.p_partkey]], aggr=[[SUM(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(DISTINCT partsupp.ps_suppkey)]] [p_partkey:Int64, SUM(lineitem.l_extendedprice):Decimal128(25, 2);N, AVG(lineitem.l_discount):Decimal128(19, 6);N, COUNT(DISTINCT partsupp.ps_suppkey):Int64;N]", - " Inner Join: part.p_partkey = partsupp.ps_partkey [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, ps_partkey:Int64, ps_suppkey:Int64]", - " Inner Join: lineitem.l_partkey = part.p_partkey [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8]", - " TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount] [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2)]", - " Filter: part.p_brand = Utf8(\"Brand#12\") OR part.p_brand = Utf8(\"Brand#23\") [p_partkey:Int64, p_brand:Utf8]", - " TableScan: part projection=[p_partkey, p_brand], partial_filters=[part.p_brand = Utf8(\"Brand#12\") OR part.p_brand = Utf8(\"Brand#23\")] [p_partkey:Int64, p_brand:Utf8]", - " TableScan: partsupp projection=[ps_partkey, ps_suppkey] [ps_partkey:Int64, ps_suppkey:Int64]", + let expected = vec![ + "Aggregate: groupBy=[[part.p_partkey]], aggr=[[SUM(lineitem.l_extendedprice), AVG(lineitem.l_discount), COUNT(DISTINCT partsupp.ps_suppkey)]] [p_partkey:Int64, SUM(lineitem.l_extendedprice):Decimal128(25, 2);N, AVG(lineitem.l_discount):Decimal128(19, 6);N, COUNT(DISTINCT partsupp.ps_suppkey):Int64;N]", + " Inner Join: part.p_partkey = partsupp.ps_partkey [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2), 
p_partkey:Int64, p_brand:Utf8, ps_partkey:Int64, ps_suppkey:Int64]", + " Inner Join: lineitem.l_partkey = part.p_partkey [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8]", + " TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount] [l_partkey:Int64, l_extendedprice:Decimal128(15, 2), l_discount:Decimal128(15, 2)]", + " Filter: part.p_brand = Utf8(\"Brand#12\") OR part.p_brand = Utf8(\"Brand#23\") [p_partkey:Int64, p_brand:Utf8]", + " TableScan: part projection=[p_partkey, p_brand], partial_filters=[part.p_brand = Utf8(\"Brand#12\") OR part.p_brand = Utf8(\"Brand#23\")] [p_partkey:Int64, p_brand:Utf8]", + " TableScan: partsupp projection=[ps_partkey, ps_suppkey] [ps_partkey:Int64, ps_suppkey:Int64]", ]; assert_eq!( diff --git a/datafusion/core/tests/sql/projection.rs b/datafusion/core/tests/sql/projection.rs index b4627c5979bf..ac697b11768c 100644 --- a/datafusion/core/tests/sql/projection.rs +++ b/datafusion/core/tests/sql/projection.rs @@ -175,22 +175,18 @@ async fn projection_on_table_scan() -> Result<()> { let state = ctx.state(); let optimized_plan = state.optimize(&logical_plan)?; match &optimized_plan { - LogicalPlan::Projection(Projection { input, .. }) => match &**input { - LogicalPlan::TableScan(TableScan { - source, - projected_schema, - .. - }) => { - assert_eq!(source.schema().fields().len(), 3); - assert_eq!(projected_schema.fields().len(), 1); - } - _ => panic!("input to projection should be TableScan"), - }, - _ => panic!("expect optimized_plan to be projection"), + LogicalPlan::TableScan(TableScan { + source, + projected_schema, + .. + }) => { + assert_eq!(source.schema().fields().len(), 3); + assert_eq!(projected_schema.fields().len(), 1); + } + _ => panic!("input to projection should be TableScan"), } - let expected = "Projection: test.c2\ - \n TableScan: test projection=[c2]"; + let expected = "TableScan: test projection=[c2]"; assert_eq!(format!("{optimized_plan:?}"), expected); let physical_plan = state.create_physical_plan(&optimized_plan).await?; @@ -291,24 +287,18 @@ async fn projection_on_memory_scan() -> Result<()> { let state = ctx.state(); let optimized_plan = state.optimize(&plan)?; match &optimized_plan { - LogicalPlan::Projection(Projection { input, .. }) => match &**input { - LogicalPlan::TableScan(TableScan { - source, - projected_schema, - .. - }) => { - assert_eq!(source.schema().fields().len(), 3); - assert_eq!(projected_schema.fields().len(), 1); - } - _ => panic!("input to projection should be InMemoryScan"), - }, - _ => panic!("expect optimized_plan to be projection"), + LogicalPlan::TableScan(TableScan { + source, + projected_schema, + .. 
+ }) => { + assert_eq!(source.schema().fields().len(), 3); + assert_eq!(projected_schema.fields().len(), 1); + } + _ => panic!("input to projection should be InMemoryScan"), } - let expected = format!( - "Projection: {UNNAMED_TABLE}.b\ - \n TableScan: {UNNAMED_TABLE} projection=[b]" - ); + let expected = format!("TableScan: {UNNAMED_TABLE} projection=[b]"); assert_eq!(format!("{optimized_plan:?}"), expected); let physical_plan = state.create_physical_plan(&optimized_plan).await?; diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 124f25d36a88..9a746f65694b 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -278,15 +278,15 @@ async fn select_distinct_simple_2() { let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+---------+----------------+", - "| c1 | c2 |", - "+---------+----------------+", - "| 0.00001 | 0.000000000001 |", - "| 0.00002 | 0.000000000002 |", - "| 0.00003 | 0.000000000003 |", - "| 0.00004 | 0.000000000004 |", - "| 0.00005 | 0.000000000005 |", - "+---------+----------------+", + "+---------+---------+", + "| c1 | c2 |", + "+---------+---------+", + "| 0.00001 | 1.0e-12 |", + "| 0.00002 | 2.0e-12 |", + "| 0.00003 | 3.0e-12 |", + "| 0.00004 | 4.0e-12 |", + "| 0.00005 | 5.0e-12 |", + "+---------+---------+", ]; assert_batches_eq!(expected, &actual); } @@ -763,11 +763,11 @@ async fn query_on_string_dictionary() -> Result<()> { let sql = "SELECT COUNT(d1) FROM test"; let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+----------------+", - "| COUNT(test.d1) |", - "+----------------+", - "| 2 |", - "+----------------+", + "+-----------+", + "| COUNT(d1) |", + "+-----------+", + "| 2 |", + "+-----------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/core/tests/sql/set_variable.rs b/datafusion/core/tests/sql/set_variable.rs index 6a85f7df166d..b7161eb2b162 100644 --- a/datafusion/core/tests/sql/set_variable.rs +++ b/datafusion/core/tests/sql/set_variable.rs @@ -16,6 +16,7 @@ // under the License. 
use super::*; +use arrow::util::pretty::pretty_format_batches; #[tokio::test] async fn set_variable_to_value() { @@ -412,14 +413,8 @@ async fn set_time_zone_bad_time_zone_format() { plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") .await .unwrap(); - let expected = vec![ - "+-----------------------------------------------------+", - "| Utf8(\"2000-01-01T00:00:00\") |", - "+-----------------------------------------------------+", - "| 2000-01-01T00:00:00 (Unknown Time Zone '+08:00:00') |", - "+-----------------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + let err = pretty_format_batches(&result).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"+08:00:00\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX"); plan_and_collect(&ctx, "SET TIME ZONE = '08:00'") .await @@ -430,14 +425,9 @@ async fn set_time_zone_bad_time_zone_format() { plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") .await .unwrap(); - let expected = vec![ - "+-------------------------------------------------+", - "| Utf8(\"2000-01-01T00:00:00\") |", - "+-------------------------------------------------+", - "| 2000-01-01T00:00:00 (Unknown Time Zone '08:00') |", - "+-------------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + + let err = pretty_format_batches(&result).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"08:00\": only offset based timezones supported without chrono-tz feature"); plan_and_collect(&ctx, "SET TIME ZONE = '08'") .await @@ -448,14 +438,9 @@ async fn set_time_zone_bad_time_zone_format() { plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") .await .unwrap(); - let expected = vec![ - "+----------------------------------------------+", - "| Utf8(\"2000-01-01T00:00:00\") |", - "+----------------------------------------------+", - "| 2000-01-01T00:00:00 (Unknown Time Zone '08') |", - "+----------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + + let err = pretty_format_batches(&result).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"08\": only offset based timezones supported without chrono-tz feature"); // we dont support named time zone yet plan_and_collect(&ctx, "SET TIME ZONE = 'Asia/Taipei'") @@ -467,14 +452,9 @@ async fn set_time_zone_bad_time_zone_format() { plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") .await .unwrap(); - let expected = vec![ - "+-------------------------------------------------------+", - "| Utf8(\"2000-01-01T00:00:00\") |", - "+-------------------------------------------------------+", - "| 2000-01-01T00:00:00 (Unknown Time Zone 'Asia/Taipei') |", - "+-------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + + let err = pretty_format_batches(&result).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"Asia/Taipei\": only offset based timezones supported without chrono-tz feature"); // this is invalid even after we support named time zone plan_and_collect(&ctx, "SET TIME ZONE = 'Asia/Taipei2'") @@ -486,12 +466,6 @@ async fn set_time_zone_bad_time_zone_format() { plan_and_collect(&ctx, "SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ") .await .unwrap(); - let expected = vec![ - "+--------------------------------------------------------+", - "| Utf8(\"2000-01-01T00:00:00\") |", - 
"+--------------------------------------------------------+", - "| 2000-01-01T00:00:00 (Unknown Time Zone 'Asia/Taipei2') |", - "+--------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + let err = pretty_format_batches(&result).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"Asia/Taipei2\": only offset based timezones supported without chrono-tz feature"); } diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index 6928e98b789b..fb114d641567 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -116,3 +116,63 @@ where o_orderstatus in ( Ok(()) } + +#[tokio::test] +async fn exists_subquery_with_same_table() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + // Subquery and outer query refer to the same table. + // It will not be rewritten to join because it is not a correlated subquery. + let sql = "SELECT t1_id, t1_name, t1_int FROM t1 WHERE EXISTS(SELECT t1_int FROM t1 WHERE t1.t1_id > t1.t1_int)"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(&("explain ".to_owned() + sql)).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " Filter: EXISTS () [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " Subquery: [t1_int:UInt32;N]", + " Projection: t1.t1_int [t1_int:UInt32;N]", + " Filter: t1.t1_id > t1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + Ok(()) +} + +#[tokio::test] +async fn in_subquery_with_same_table() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + // Subquery and outer query refer to the same table. + // It will be rewritten to join because in-subquery has extra predicate(`t1.t1_id = __correlated_sq_1.t1_int`). 
+ let sql = "SELECT t1_id, t1_name, t1_int FROM t1 WHERE t1_id IN(SELECT t1_int FROM t1 WHERE t1.t1_id > t1.t1_int)"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(&("explain ".to_owned() + sql)).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " LeftSemi Join: t1.t1_id = __correlated_sq_1.t1_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [t1_int:UInt32;N]", + " Projection: t1.t1_int AS t1_int [t1_int:UInt32;N]", + " Filter: t1.t1_id > t1.t1_int [t1_id:UInt32;N, t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + Ok(()) +} diff --git a/datafusion/core/tests/sql/udf.rs b/datafusion/core/tests/sql/udf.rs index 0688aa319488..a1c48595605d 100644 --- a/datafusion/core/tests/sql/udf.rs +++ b/datafusion/core/tests/sql/udf.rs @@ -121,6 +121,64 @@ async fn scalar_udf() -> Result<()> { Ok(()) } +#[tokio::test] +async fn scalar_udf_zero_params() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(Int32Array::from_slice([1, 10, 10, 100]))], + )?; + let ctx = SessionContext::new(); + + ctx.register_batch("t", batch)?; + // create function just returns 100 regardless of inp + let myfunc = |args: &[ArrayRef]| { + let num_rows = args[0].len(); + Ok(Arc::new((0..num_rows).map(|_| 100).collect::()) as ArrayRef) + }; + let myfunc = make_scalar_function(myfunc); + + ctx.register_udf(create_udf( + "get_100", + vec![], + Arc::new(DataType::Int32), + Volatility::Immutable, + myfunc, + )); + + let result = plan_and_collect(&ctx, "select get_100() a from t").await?; + let expected = vec![ + "+-----+", // + "| a |", // + "+-----+", // + "| 100 |", // + "| 100 |", // + "| 100 |", // + "| 100 |", // + "+-----+", // + ]; + assert_batches_eq!(expected, &result); + + let result = plan_and_collect(&ctx, "select get_100() a").await?; + let expected = vec![ + "+-----+", // + "| a |", // + "+-----+", // + "| 100 |", // + "+-----+", // + ]; + assert_batches_eq!(expected, &result); + + let result = plan_and_collect(&ctx, "select get_100() from t where a=999").await?; + let expected = vec![ + "++", // + "++", + ]; + assert_batches_eq!(expected, &result); + Ok(()) +} + /// tests the creation, registration and usage of a UDAF #[tokio::test] async fn simple_udaf() -> Result<()> { @@ -158,7 +216,7 @@ async fn simple_udaf() -> Result<()> { "+-------------+", "| my_avg(t.a) |", "+-------------+", - "| 3 |", + "| 3.0 |", "+-------------+", ]; assert_batches_eq!(expected, &result); diff --git a/datafusion/core/tests/sql/union.rs b/datafusion/core/tests/sql/union.rs index 804833bb9d77..4cf908aa8582 100644 --- a/datafusion/core/tests/sql/union.rs +++ b/datafusion/core/tests/sql/union.rs @@ -38,15 +38,13 @@ async fn union_with_except_input() -> Result<()> { "Explain [plan_type:Utf8, plan:Utf8]", " Union [name:UInt8;N]", " LeftAnti Join: t1.name = t2.name [name:UInt8;N]", - " Distinct: [name:UInt8;N]", + " Aggregate: groupBy=[[t1.name]], aggr=[[]] [name:UInt8;N]", " 
TableScan: t1 projection=[name] [name:UInt8;N]", - " Projection: t2.name [name:UInt8;N]", - " TableScan: t2 projection=[name] [name:UInt8;N]", + " TableScan: t2 projection=[name] [name:UInt8;N]", " LeftAnti Join: t2.name = t1.name [name:UInt8;N]", - " Distinct: [name:UInt8;N]", + " Aggregate: groupBy=[[t2.name]], aggr=[[]] [name:UInt8;N]", " TableScan: t2 projection=[name] [name:UInt8;N]", - " Projection: t1.name [name:UInt8;N]", - " TableScan: t1 projection=[name] [name:UInt8;N]", + " TableScan: t1 projection=[name] [name:UInt8;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -79,16 +77,14 @@ async fn union_with_type_coercion() -> Result<()> { "Explain [plan_type:Utf8, plan:Utf8]", " Union [id:Int32;N, name:UInt8;N]", " LeftAnti Join: t1.id = CAST(t2.id AS Int32), t1.name = t2.name [id:Int32;N, name:UInt8;N]", - " Distinct: [id:Int32;N, name:UInt8;N]", + " Aggregate: groupBy=[[t1.id, t1.name]], aggr=[[]] [id:Int32;N, name:UInt8;N]", " TableScan: t1 projection=[id, name] [id:Int32;N, name:UInt8;N]", - " Projection: t2.id, t2.name [id:UInt8;N, name:UInt8;N]", - " TableScan: t2 projection=[id, name] [id:UInt8;N, name:UInt8;N]", + " TableScan: t2 projection=[id, name] [id:UInt8;N, name:UInt8;N]", " Projection: CAST(t2.id AS Int32) AS id, t2.name [id:Int32;N, name:UInt8;N]", " LeftAnti Join: CAST(t2.id AS Int32) = t1.id, t2.name = t1.name [id:UInt8;N, name:UInt8;N]", - " Distinct: [id:UInt8;N, name:UInt8;N]", + " Aggregate: groupBy=[[t2.id, t2.name]], aggr=[[]] [id:UInt8;N, name:UInt8;N]", " TableScan: t2 projection=[id, name] [id:UInt8;N, name:UInt8;N]", - " Projection: t1.id, t1.name [id:Int32;N, name:UInt8;N]", - " TableScan: t1 projection=[id, name] [id:Int32;N, name:UInt8;N]", + " TableScan: t1 projection=[id, name] [id:Int32;N, name:UInt8;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); diff --git a/datafusion/core/tests/sql/wildcard.rs b/datafusion/core/tests/sql/wildcard.rs index a55ccb80f282..8cecfd829e9b 100644 --- a/datafusion/core/tests/sql/wildcard.rs +++ b/datafusion/core/tests/sql/wildcard.rs @@ -26,25 +26,25 @@ async fn select_qualified_wildcard() -> Result<()> { let results = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+---------+----------------+-------+", - "| c1 | c2 | c3 |", - "+---------+----------------+-------+", - "| 0.00001 | 0.000000000001 | true |", - "| 0.00002 | 0.000000000002 | false |", - "| 0.00002 | 0.000000000002 | false |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "+---------+----------------+-------+", + "+---------+---------+-------+", + "| c1 | c2 | c3 |", + "+---------+---------+-------+", + "| 0.00001 | 1.0e-12 | true |", + "| 0.00002 | 2.0e-12 | false |", + "| 0.00002 | 2.0e-12 | false |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true 
|", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "+---------+---------+-------+", ]; assert_batches_eq!(expected, &results); @@ -61,25 +61,25 @@ async fn select_non_alias_qualified_wildcard() -> Result<()> { let results = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+---------+----------------+-------+", - "| c1 | c2 | c3 |", - "+---------+----------------+-------+", - "| 0.00001 | 0.000000000001 | true |", - "| 0.00002 | 0.000000000002 | false |", - "| 0.00002 | 0.000000000002 | false |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00003 | 0.000000000003 | true |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00004 | 0.000000000004 | false |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "| 0.00005 | 0.000000000005 | true |", - "+---------+----------------+-------+", + "+---------+---------+-------+", + "| c1 | c2 | c3 |", + "+---------+---------+-------+", + "| 0.00001 | 1.0e-12 | true |", + "| 0.00002 | 2.0e-12 | false |", + "| 0.00002 | 2.0e-12 | false |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00003 | 3.0e-12 | true |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00004 | 4.0e-12 | false |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "| 0.00005 | 5.0e-12 | true |", + "+---------+---------+-------+", ]; assert_batches_eq!(expected, &results); diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 7ef4af23a059..329eaee529cf 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -20,1053 +20,6 @@ use ::parquet::arrow::arrow_writer::ArrowWriter; use ::parquet::file::properties::WriterProperties; use datafusion::execution::options::ReadOptions; -#[tokio::test] -async fn window_in_expression() -> Result<()> { - let ctx = SessionContext::new(); - let sql = "select 1 - lag(amount, 1) over (order by idx) as column1 from (values ('a', 1, 100), ('a', 2, 150)) as t (col1, idx, amount)"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------+", - "| column1 |", - "+---------+", - "| |", - "| -99 |", - "+---------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_with_agg_in_expression() -> Result<()> { - let ctx = SessionContext::new(); - let sql = "select col1, idx, count(*), sum(amount), lag(sum(amount), 1) over (order by idx) as prev_amount, - sum(amount) - lag(sum(amount), 1) over (order by idx) as difference from ( - select * from (values ('a', 1, 100), ('a', 2, 150)) as t (col1, idx, amount) - ) a - group by col1, idx;"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------+-----+-----------------+---------------+-------------+------------+", - "| col1 | idx | COUNT(UInt8(1)) | SUM(a.amount) | prev_amount | difference |", - "+------+-----+-----------------+---------------+-------------+------------+", - "| a | 1 | 1 | 100 | | |", - "| a | 2 | 1 | 150 | 100 | 50 |", - "+------+-----+-----------------+---------------+-------------+------------+", - ]; - assert_batches_eq!(expected, &actual); 
- Ok(()) -} - -#[tokio::test] -async fn window_frame_empty() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c3) OVER() as sum1, \ - COUNT(*) OVER () as count1 \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------+--------+", - "| sum1 | count1 |", - "+------+--------+", - "| 781 | 100 |", - "| 781 | 100 |", - "| 781 | 100 |", - "| 781 | 100 |", - "| 781 | 100 |", - "+------+--------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_rows_preceding() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - AVG(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - COUNT(*) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | AVG(aggregate_test_100.c4) | COUNT(UInt8(1)) |", - "+----------------------------+----------------------------+-----------------+", - "| -48302 | -16100.666666666666 | 3 |", - "| 11243 | 3747.6666666666665 | 3 |", - "| -51311 | -17103.666666666668 | 3 |", - "| -2391 | -797 | 3 |", - "| 46756 | 15585.333333333334 | 3 |", - "+----------------------------+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_rows_preceding_stddev_variance() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - VAR(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - VAR_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - STDDEV(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - STDDEV_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+", - "| VARIANCE(aggregate_test_100.c4) | VARIANCEPOP(aggregate_test_100.c4) | STDDEV(aggregate_test_100.c4) | STDDEVPOP(aggregate_test_100.c4) |", - "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+", - "| 46721.33333333174 | 31147.555555554496 | 216.15118166073427 | 176.4867007894773 |", - "| 2639429.333333332 | 1759619.5555555548 | 1624.6320609089714 | 1326.5065229977404 |", - "| 746202.3333333324 | 497468.2222222216 | 863.8300372951455 | 705.3142719541563 |", - "| 768422.9999999981 | 512281.9999999988 | 876.5973990378925 | 715.7387791645767 |", - "| 66526.3333333288 | 44350.88888888587 | 257.9269922542594 | 210.5965073045749 |", - "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_rows_preceding_with_partition_unique_order_by() -> Result<()> { - let ctx = SessionContext::new(); - 
register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - AVG(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - COUNT(*) OVER(PARTITION BY c2 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | AVG(aggregate_test_100.c4) | COUNT(UInt8(1)) |", - "+----------------------------+----------------------------+-----------------+", - "| -38611 | -19305.5 | 2 |", - "| 17547 | 8773.5 | 2 |", - "| -1301 | -650.5 | 2 |", - "| 26638 | 13319 | 3 |", - "| 26861 | 8953.666666666666 | 3 |", - "+----------------------------+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} -/// The partition by clause conducts sorting according to given partition column by default. If the -/// sorting columns have non unique values, the unstable sorting may produce indeterminate results. -/// Therefore, we are commenting out the following test for now. - -// #[tokio::test] -// async fn window_frame_rows_preceding_with_non_unique_partition() -> Result<()> { -// let ctx = SessionContext::new(); -// register_aggregate_csv(&ctx).await?; -// let sql = "SELECT \ -// SUM(c4) OVER(PARTITION BY c1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ -// COUNT(*) OVER(PARTITION BY c2 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\ -// FROM aggregate_test_100 \ -// ORDER BY c9 \ -// LIMIT 5"; -// let actual = execute_to_batches(&ctx, sql).await; -// let expected = vec![ -// "+----------------------------+-----------------+", -// "| SUM(aggregate_test_100.c4) | COUNT(UInt8(1)) |", -// "+----------------------------+-----------------+", -// "| -33822 | 3 |", -// "| 20808 | 3 |", -// "| -29881 | 3 |", -// "| -47613 | 3 |", -// "| -13474 | 3 |", -// "+----------------------------+-----------------+", -// ]; -// assert_batches_eq!(expected, &actual); -// Ok(()) -// } - -#[tokio::test] -async fn window_frame_ranges_preceding_following_desc() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c4) OVER(ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - SUM(c3) OVER(ORDER BY c2 DESC RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING),\ - COUNT(*) OVER(ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | SUM(aggregate_test_100.c3) | COUNT(UInt8(1)) |", - "+----------------------------+----------------------------+-----------------+", - "| 52276 | 781 | 56 |", - "| 260620 | 781 | 63 |", - "| -28623 | 781 | 37 |", - "| 260620 | 781 | 63 |", - "| 260620 | 781 | 63 |", - "+----------------------------+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_asc_desc_large() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - SUM(c5) OVER (ORDER BY c2 ASC, c6 DESC) as sum1 - FROM aggregate_test_100 - LIMIT 5"; - let actual = execute_to_batches(&ctx, 
sql).await; - let expected = vec![ - "+-------------+", - "| sum1 |", - "+-------------+", - "| -1383162419 |", - "| -3265456275 |", - "| -3909681744 |", - "| -5241214934 |", - "| -4246910946 |", - "+-------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_desc_large() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - SUM(c5) OVER (ORDER BY c2 DESC, c6 ASC) as sum1 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------+", - "| sum1 |", - "+-------------+", - "| 11212193439 |", - "| 22799733943 |", - "| 2935356871 |", - "| 15810962683 |", - "| 18035025006 |", - "+-------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_null_timestamp_order_by() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - SUM(c1) OVER (ORDER BY c2 DESC) as summation1 - FROM null_cases - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------------+", - "| summation1 |", - "+------------+", - "| 962 |", - "| 962 |", - "| 962 |", - "| 962 |", - "| 962 |", - "+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_null_desc() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - COUNT(c2) OVER (ORDER BY c1 DESC RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) - FROM null_cases - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| COUNT(null_cases.c2) |", - "+----------------------+", - "| 9 |", - "| 9 |", - "| 9 |", - "| 9 |", - "| 9 |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_null_asc() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - COUNT(c2) OVER (ORDER BY c1 RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) - FROM null_cases - ORDER BY c1 - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| COUNT(null_cases.c2) |", - "+----------------------+", - "| 2 |", - "| 2 |", - "| 2 |", - "| 2 |", - "| 5 |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_null_asc_null_first() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - COUNT(c2) OVER (ORDER BY c1 NULLS FIRST RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) - FROM null_cases - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| COUNT(null_cases.c2) |", - "+----------------------+", - "| 9 |", - "| 9 |", - "| 9 |", - "| 9 |", - "| 9 |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_null_desc_null_last() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - COUNT(c2) OVER (ORDER BY c1 DESC NULLS LAST RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) - FROM null_cases - LIMIT 5"; - 
let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| COUNT(null_cases.c2) |", - "+----------------------+", - "| 5 |", - "| 5 |", - "| 5 |", - "| 6 |", - "| 6 |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_rows_order_by_null() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as a, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as b, - SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as c, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as d, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as e, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as f, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as g, - SUM(c1) OVER (ORDER BY c3) as h, - SUM(c1) OVER (ORDER BY c3 DESC) as i, - SUM(c1) OVER (ORDER BY c3 NULLS first) as j, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first) as k, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last) as l, - SUM(c1) OVER (ORDER BY c3, c2) as m, - SUM(c1) OVER (ORDER BY c3, c1 DESC) as n, - SUM(c1) OVER (ORDER BY c3 DESC, c1) as o, - SUM(c1) OVER (ORDER BY c3, c1 NULLs first) as p, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a1, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as b1, - SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as c1, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as d1, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as e1, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as f1, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as g1, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as h1, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as j1, - SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as k1, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as l1, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as m1, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as n1, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as o1, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as h11, - SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as j11, - SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as k11, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as l11, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as m11, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as n11, - SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as o11 - FROM null_cases - ORDER BY c3 - LIMIT 
5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+------+------+-----+-----+------+-----+-----+-----+------+-----+------+------+-----+-----+-----+------+-----+------+------+-----+------+------+-----+------+-----+-----+------+", - "| a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | a1 | b1 | c1 | d1 | e1 | f1 | g1 | h1 | j1 | k1 | l1 | m1 | n1 | o1 | h11 | j11 | k11 | l11 | m11 | n11 | o11 |", - "+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+------+------+-----+-----+------+-----+-----+-----+------+-----+------+------+-----+-----+-----+------+-----+------+------+-----+------+------+-----+------+-----+-----+------+", - "| 412 | 412 | 339 | 412 | 339 | 339 | 412 | | 4627 | | 4627 | 4627 | | | 4627 | | 412 | 412 | 4627 | 412 | 4627 | 4627 | 412 | | | 4627 | | 4627 | 4627 | | 4627 | 4627 | | 4627 | | | 4627 |", - "| 488 | 488 | 412 | 488 | 412 | 412 | 488 | 72 | 4627 | 72 | 4627 | 4627 | 72 | 72 | 4627 | 72 | 488 | 488 | 4627 | 488 | 4627 | 4627 | 488 | 72 | 72 | 4627 | 72 | 4627 | 4627 | 72 | 4627 | 4627 | 72 | 4627 | 72 | 72 | 4627 |", - "| 543 | 543 | 488 | 543 | 488 | 488 | 543 | 96 | 4555 | 96 | 4555 | 4555 | 96 | 96 | 4555 | 96 | 543 | 543 | 4627 | 543 | 4627 | 4627 | 543 | 96 | 96 | 4555 | 96 | 4555 | 4555 | 96 | 4555 | 4555 | 96 | 4555 | 96 | 96 | 4555 |", - "| 553 | 553 | 543 | 553 | 543 | 543 | 553 | 115 | 4531 | 115 | 4531 | 4531 | 115 | 115 | 4531 | 115 | 553 | 553 | 4627 | 553 | 4627 | 4627 | 553 | 115 | 115 | 4531 | 115 | 4531 | 4531 | 115 | 4531 | 4531 | 115 | 4531 | 115 | 115 | 4531 |", - "| 553 | 553 | 553 | 553 | 553 | 553 | 553 | 140 | 4512 | 140 | 4512 | 4512 | 140 | 140 | 4512 | 140 | 553 | 553 | 4627 | 553 | 4627 | 4627 | 553 | 140 | 140 | 4512 | 140 | 4512 | 4512 | 140 | 4512 | 4512 | 140 | 4512 | 140 | 140 | 4512 |", - "+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+------+------+-----+-----+------+-----+-----+-----+------+-----+------+------+-----+-----+-----+------+-----+------+------+-----+------+------+-----+------+-----+-----+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_rows_preceding_with_unique_partition() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - COUNT(*) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | COUNT(UInt8(1)) |", - "+----------------------------+-----------------+", - "| -38611 | 2 |", - "| 17547 | 2 |", - "| -1301 | 2 |", - "| 26638 | 2 |", - "| 26861 | 3 |", - "+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_preceding_following() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c4) OVER(ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING),\ - SUM(c3) OVER(ORDER BY c2 RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING),\ - COUNT(*) OVER(ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = 
execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | SUM(aggregate_test_100.c3) | COUNT(UInt8(1)) |", - "+----------------------------+----------------------------+-----------------+", - "| 52276 | 781 | 56 |", - "| 260620 | 781 | 63 |", - "| -28623 | 781 | 37 |", - "| 260620 | 781 | 63 |", - "| 260620 | 781 | 63 |", - "+----------------------------+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_ntile() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - NTILE(8) OVER (ORDER BY C4) as ntile1,\ - NTILE(12) OVER (ORDER BY C12 DESC) as ntile2 \ - FROM aggregate_test_100 \ - ORDER BY c7 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+--------+--------+", - "| ntile1 | ntile2 |", - "+--------+--------+", - "| 8 | 12 |", - "| 5 | 11 |", - "| 3 | 11 |", - "| 2 | 7 |", - "| 7 | 12 |", - "+--------+--------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_string_check() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(LENGTH(c13)) OVER(ORDER BY c13), \ - SUM(LENGTH(c1)) OVER(ORDER BY c1) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------------------------+---------------------------------------------+", - "| SUM(characterlength(aggregate_test_100.c13)) | SUM(characterlength(aggregate_test_100.c1)) |", - "+----------------------------------------------+---------------------------------------------+", - "| 2100 | 100 |", - "| 510 | 79 |", - "| 1440 | 21 |", - "| 1830 | 61 |", - "| 2010 | 21 |", - "+----------------------------------------------+---------------------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_order_by_unique() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c5) OVER (ORDER BY c5) as sum1, \ - COUNT(*) OVER (ORDER BY c9) as count1 \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+--------------+--------+", - "| sum1 | count1 |", - "+--------------+--------+", - "| -49877765574 | 1 |", - "| -50025861694 | 2 |", - "| -45402230071 | 3 |", - "| -14557735645 | 4 |", - "| -18365391649 | 5 |", - "+--------------+--------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -/// If the sorting columns have non unique values, the unstable sorting may produce -/// indeterminate results. Therefore, we are commenting out the following test for now. 
-/// -// #[tokio::test] -// async fn window_frame_order_by_non_unique() -> Result<()> { -// let ctx = SessionContext::new(); -// register_aggregate_csv(&ctx).await?; -// let sql = "SELECT \ -// c2, \ -// c9, \ -// SUM(c5) OVER (ORDER BY c2), \ -// COUNT(*) OVER (ORDER BY c2) \ -// FROM aggregate_test_100 \ -// ORDER BY c2 \ -// LIMIT 5"; -// let actual = execute_to_batches(&ctx, sql).await; -// let expected = vec![ -// "+----+------------+----------------------------+-----------------+", -// "| c2 | c9 | SUM(aggregate_test_100.c5) | COUNT(UInt8(1)) |", -// "+----+------------+----------------------------+-----------------+", -// "| 1 | 879082834 | -438598674 | 22 |", -// "| 1 | 3542840110 | -438598674 | 22 |", -// "| 1 | 3275293996 | -438598674 | 22 |", -// "| 1 | 774637006 | -438598674 | 22 |", -// "| 1 | 4015442341 | -438598674 | 22 |", -// "+----+------------+----------------------------+-----------------+", -// ]; -// assert_batches_eq!(expected, &actual); -// Ok(()) -// } - -#[tokio::test] -async fn window_frame_ranges_unbounded_preceding_following() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as sum1, \ - COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as cnt1 \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------+------+", - "| sum1 | cnt1 |", - "+------+------+", - "| 285 | 100 |", - "| 123 | 63 |", - "| 285 | 100 |", - "| 123 | 63 |", - "| 123 | 63 |", - "+------+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_preceding_and_preceding() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN 3 PRECEDING AND 1 PRECEDING), \ - COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN 3 PRECEDING AND 1 PRECEDING) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c2) | COUNT(UInt8(1)) |", - "+----------------------------+-----------------+", - "| 123 | 63 |", - "| 22 | 22 |", - "| 193 | 64 |", - "| 22 | 22 |", - "| 22 | 22 |", - "+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_unbounded_preceding_following_diff_col() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT \ - SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING), \ - COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c2) | COUNT(UInt8(1)) |", - "+----------------------------+-----------------+", - "| 162 | 37 |", - "| 101 | 41 |", - "| 70 | 14 |", - "| 101 | 41 |", - "| 101 | 41 |", - "+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_partition_by_order_by_desc() -> Result<()> { - let ctx = SessionContext::new(); - 
register_aggregate_csv(&ctx).await?; - let sql = "SELECT - SUM(c4) OVER(PARTITION BY c1 ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+", - "| SUM(aggregate_test_100.c4) |", - "+----------------------------+", - "| -124618 |", - "| 205080 |", - "| -40819 |", - "| -19517 |", - "| 47246 |", - "+----------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_range_float() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - SUM(c12) OVER (ORDER BY C12 RANGE BETWEEN 0.2 PRECEDING AND 0.2 FOLLOWING) - FROM aggregate_test_100 - ORDER BY C9 - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------------------------+", - "| SUM(aggregate_test_100.c12) |", - "+-----------------------------+", - "| 2.5476701803634296 |", - "| 10.6299412548214 |", - "| 2.5476701803634296 |", - "| 20.349518503437288 |", - "| 21.408674363507753 |", - "+-----------------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_timestamp() -> Result<()> { - // define a schema. - let schema = Arc::new(Schema::new(vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - )])); - - // define data in two partitions - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(TimestampNanosecondArray::from_slice([ - 1664264591000000000, - 1664264592000000000, - 1664264592000000000, - 1664264593000000000, - 1664264594000000000, - 1664364594000000000, - 1664464594000000000, - 1664564594000000000, - ]))], - ) - .unwrap(); - - let ctx = SessionContext::new(); - // declare a new context. In spark API, this corresponds to a new spark SQLsession - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). 
- let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); - // Register table - ctx.register_table("t", Arc::new(provider)).unwrap(); - - // execute the query - let df = ctx - .sql( - "SELECT - ts, - COUNT(*) OVER (ORDER BY ts RANGE BETWEEN INTERVAL '1' DAY PRECEDING AND INTERVAL '2 DAY' FOLLOWING) AS cnt1, - COUNT(*) OVER (ORDER BY ts RANGE BETWEEN '0 DAY' PRECEDING AND '0' DAY FOLLOWING) as cnt2, - COUNT(*) OVER (ORDER BY ts RANGE BETWEEN '5' SECOND PRECEDING AND CURRENT ROW) as cnt3 - FROM t - ORDER BY ts" - ) - .await?; - - let actual = df.collect().await?; - let expected = vec![ - "+---------------------+------+------+------+", - "| ts | cnt1 | cnt2 | cnt3 |", - "+---------------------+------+------+------+", - "| 2022-09-27T07:43:11 | 6 | 1 | 1 |", - "| 2022-09-27T07:43:12 | 6 | 2 | 3 |", - "| 2022-09-27T07:43:12 | 6 | 2 | 3 |", - "| 2022-09-27T07:43:13 | 6 | 1 | 4 |", - "| 2022-09-27T07:43:14 | 6 | 1 | 5 |", - "| 2022-09-28T11:29:54 | 2 | 1 | 1 |", - "| 2022-09-29T15:16:34 | 2 | 1 | 1 |", - "| 2022-09-30T19:03:14 | 1 | 1 | 1 |", - "+---------------------+------+------+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_ranges_unbounded_preceding_err() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - // execute the query - let df = ctx - .sql( - "SELECT \ - SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED PRECEDING), \ - COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED PRECEDING) \ - FROM aggregate_test_100 \ - ORDER BY c9 \ - LIMIT 5", - ) - .await; - assert_eq!( - df.err().unwrap().to_string(), - "Execution error: Invalid window frame: end bound cannot be unbounded preceding" - .to_owned() - ); - Ok(()) -} - -#[tokio::test] -async fn window_frame_groups_preceding_following_desc() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - SUM(c4) OVER(ORDER BY c2 DESC GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING), - SUM(c3) OVER(ORDER BY c2 DESC GROUPS BETWEEN 10000 PRECEDING AND 10000 FOLLOWING), - COUNT(*) OVER(ORDER BY c2 DESC GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------------+----------------------------+-----------------+", - "| SUM(aggregate_test_100.c4) | SUM(aggregate_test_100.c3) | COUNT(UInt8(1)) |", - "+----------------------------+----------------------------+-----------------+", - "| 52276 | 781 | 56 |", - "| 260620 | 781 | 63 |", - "| -28623 | 781 | 37 |", - "| 260620 | 781 | 63 |", - "| 260620 | 781 | 63 |", - "+----------------------------+----------------------------+-----------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_groups_order_by_null_desc() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - COUNT(c2) OVER (ORDER BY c1 DESC GROUPS BETWEEN 5 PRECEDING AND 3 FOLLOWING) - FROM null_cases - LIMIT 5"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| COUNT(null_cases.c2) |", - "+----------------------+", - "| 12 |", - "| 12 |", - "| 12 |", - "| 12 |", - "| 12 |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_groups() -> 
Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as a, - SUM(c1) OVER (ORDER BY c3 DESC GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as b, - SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as c, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as d, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as e, - SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as f, - SUM(c1) OVER (ORDER BY c3 GROUPS current row) as a1, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a2, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a3, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a4, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a5, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as a6, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) as a7, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 3 FOLLOWING AND UNBOUNDED FOLLOWING) as a8, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN current row AND UNBOUNDED FOLLOWING) as a9, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN current row AND 3 FOLLOWING) as a10, - SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 5 FOLLOWING AND 7 FOLLOWING) as a11, - SUM(c1) OVER (ORDER BY c3 DESC GROUPS current row) as a21, - SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a22, - SUM(c1) OVER (ORDER BY c3 DESC NULLS last GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a23, - SUM(c1) OVER (ORDER BY c3 NULLS last GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a24, - SUM(c1) OVER (ORDER BY c3 DESC NULLS first GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a25 - FROM null_cases - ORDER BY c3 - LIMIT 10"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+------+------+------+------+-----+-----+-----+-----+------+-----+------+", - "| a | b | c | d | e | f | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | a10 | a11 | a21 | a22 | a23 | a24 | a25 |", - "+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+------+------+------+------+-----+-----+-----+-----+------+-----+------+", - "| 412 | 307 | 412 | 307 | 307 | 412 | | | | 412 | | 4627 | 4627 | 4531 | 4627 | 115 | 85 | | | 4487 | 412 | 4627 |", - "| 488 | 339 | 488 | 339 | 339 | 488 | 72 | | | 488 | 72 | 4627 | 4627 | 4512 | 4627 | 140 | 153 | 72 | | 4473 | 488 | 4627 |", - "| 543 | 412 | 543 | 412 | 412 | 543 | 24 | | | 543 | 96 | 4627 | 4627 | 4487 | 4555 | 82 | 122 | 24 | | 4442 | 543 | 4555 |", - "| 553 | 488 | 553 | 488 | 488 | 553 | 19 | | | 553 | 115 | 4627 | 4555 | 4473 | 4531 | 89 | 114 | 19 | | 4402 | 553 | 4531 |", - "| 553 | 543 | 553 | 543 | 543 | 553 | 25 | | | 553 | 140 | 4627 | 4531 | 4442 | 4512 | 110 | 105 | 25 | | 4320 | 553 | 4512 |", - "| 591 | 553 | 591 | 553 | 553 | 591 | 14 | | | 591 | 154 | 4627 | 4512 | 4402 | 4487 | 167 | 181 | 14 | | 4320 | 591 | 4487 |", - "| 651 | 553 | 651 | 553 | 553 | 651 | 31 | 72 | 72 | 651 | 185 | 4627 | 4487 | 4320 | 4473 | 153 | 204 | 31 | 72 | 4288 | 651 | 4473 |", - "| 662 | 591 | 662 | 591 | 591 | 662 | 40 | 96 | 96 | 662 | 225 | 4627 | 4473 | 4320 | 
4442 | 154 | 141 | 40 | 96 | 4215 | 662 | 4442 |", - "| 697 | 651 | 697 | 651 | 651 | 697 | 82 | 115 | 115 | 697 | 307 | 4627 | 4442 | 4288 | 4402 | 187 | 65 | 82 | 115 | 4139 | 697 | 4402 |", - "| 758 | 662 | 758 | 662 | 662 | 758 | | 140 | 140 | 758 | 307 | 4627 | 4402 | 4215 | 4320 | 181 | 48 | | 140 | 4084 | 758 | 4320 |", - "+-----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+------+------+------+------+-----+-----+-----+-----+------+-----+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_groups_multiple_order_columns() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_null_cases_csv(&ctx).await?; - let sql = "SELECT - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as a, - SUM(c1) OVER (ORDER BY c2, c3 DESC GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as b, - SUM(c1) OVER (ORDER BY c2, c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as c, - SUM(c1) OVER (ORDER BY c2, c3 DESC NULLS last GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as d, - SUM(c1) OVER (ORDER BY c2, c3 DESC NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as e, - SUM(c1) OVER (ORDER BY c2, c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as f, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS current row) as a1, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a2, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a3, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a4, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a5, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as a6, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) as a7, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 3 FOLLOWING AND UNBOUNDED FOLLOWING) as a8, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN current row AND UNBOUNDED FOLLOWING) as a9, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN current row AND 3 FOLLOWING) as a10, - SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 5 FOLLOWING AND 7 FOLLOWING) as a11 - FROM null_cases - ORDER BY c3 - LIMIT 10"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------+-----+------+-----+-----+------+----+-----+------+------+------+------+------+------+------+-----+-----+", - "| a | b | c | d | e | f | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | a10 | a11 |", - "+------+-----+------+-----+-----+------+----+-----+------+------+------+------+------+------+------+-----+-----+", - "| 818 | 910 | 818 | 910 | 910 | 818 | | 249 | 249 | 818 | 432 | 4627 | 4234 | 4157 | 4195 | 98 | 82 |", - "| 537 | 979 | 537 | 979 | 979 | 537 | 72 | | | 537 | 210 | 4627 | 4569 | 4378 | 4489 | 169 | 55 |", - "| 811 | 838 | 811 | 838 | 838 | 811 | 24 | 221 | 3075 | 3665 | 3311 | 4627 | 1390 | 1276 | 1340 | 117 | 144 |", - "| 763 | 464 | 763 | 464 | 464 | 763 | 19 | 168 | 3572 | 4167 | 3684 | 4627 | 962 | 829 | 962 | 194 | 80 |", - "| 552 | 964 | 552 | 964 | 964 | 552 | 25 | | | 552 | 235 | 4627 | 4489 | 4320 | 4417 | 167 | 39 |", - "| 963 | 930 | 963 | 930 | 930 | 963 | 14 | 201 | 818 | 1580 | 1098 | 4627 | 3638 | 3455 | 3543 | 177 | 224 |", - "| 1113 | 814 | 1113 | 814 | 814 | 1113 | 31 | 415 | 2653 | 3351 | 2885 | 4627 | 1798 | 1694 | 1773 | 165 | 162 |", - "| 780 | 868 | 780 | 868 | 868 | 780 | 40 | 258 | 3143 | 3665 | 3351 | 4627 | 1340 | 1223 | 1316 
| 117 | 102 |", - "| 740 | 466 | 740 | 466 | 466 | 740 | 82 | 164 | 3592 | 4168 | 3766 | 4627 | 962 | 768 | 943 | 244 | 122 |", - "| 772 | 832 | 772 | 832 | 832 | 772 | | 277 | 3189 | 3684 | 3351 | 4627 | 1316 | 1199 | 1276 | 119 | 64 |", - "+------+-----+------+-----+-----+------+----+-----+------+------+------+------+------+------+------+-----+-----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn window_frame_groups_without_order_by() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - // Try executing an erroneous query (the ORDER BY clause is missing in the - // window frame): - let err = ctx - .sql( - "SELECT - SUM(c4) OVER(PARTITION BY c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) - FROM aggregate_test_100 - ORDER BY c9;", - ) - .await - .unwrap_err(); - assert_contains!( - err.to_string(), - "Error during planning: GROUPS mode requires an ORDER BY clause".to_owned() - ); - Ok(()) -} - -#[tokio::test] -async fn window_frame_lag() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - // execute the query - let df = ctx - .sql( - "SELECT c2, - lag(c2, c2, c2) OVER () as lag1 - FROM aggregate_test_100;", - ) - .await?; - let err = df.collect().await.unwrap_err(); - assert_eq!( - err.to_string(), - "This feature is not implemented: There is only support Literal types for field at idx: 1 in Window Function".to_owned() - ); - Ok(()) -} - -#[tokio::test] -async fn window_frame_creation() -> Result<()> { - let ctx = SessionContext::new(); - register_aggregate_csv(&ctx).await?; - // execute the query - let df = ctx - .sql( - "SELECT - COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 2 PRECEDING) - FROM aggregate_test_100;", - ) - .await?; - let results = df.collect().await; - assert_eq!( - results.err().unwrap().to_string(), - "Error during planning: Invalid window frame: start bound (1 PRECEDING) cannot be larger than end bound (2 PRECEDING)" - ); - - let df = ctx - .sql( - "SELECT - COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN 2 FOLLOWING AND 1 FOLLOWING) - FROM aggregate_test_100;", - ) - .await?; - let results = df.collect().await; - assert_eq!( - results.err().unwrap().to_string(), - "Error during planning: Invalid window frame: start bound (2 FOLLOWING) cannot be larger than end bound (1 FOLLOWING)" - ); - - let err = ctx - .sql( - "SELECT - COUNT(c1) OVER(GROUPS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) - FROM aggregate_test_100;", - ) - .await - .unwrap_err(); - assert_contains!( - err.to_string(), - "Error during planning: GROUPS mode requires an ORDER BY clause" - ); - - Ok(()) -} - #[tokio::test] async fn window_frame_creation_type_checking() -> Result<()> { // The following query has type error. We should test the error could be detected @@ -1104,225 +57,10 @@ async fn window_frame_creation_type_checking() -> Result<()> { // Error is returned from the logical plan. check_query( false, - "Internal error: Optimizer rule 'type_coercion' failed due to unexpected error: Arrow error: Cast error: Cannot cast string '1 DAY' to value of UInt32 type" + "Internal error: Optimizer rule 'type_coercion' failed due to unexpected error: Execution error: Cannot cast Utf8(\"1 DAY\") to UInt32." 
).await } -#[tokio::test] -async fn test_window_row_number_aggregate() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - c8, - ROW_NUMBER() OVER(ORDER BY c9) AS rn1, - ROW_NUMBER() OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rn2 - FROM aggregate_test_100 - ORDER BY c8 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+-----+-----+", - "| c8 | rn1 | rn2 |", - "+-----+-----+-----+", - "| 102 | 73 | 73 |", - "| 299 | 1 | 1 |", - "| 363 | 41 | 41 |", - "| 417 | 14 | 14 |", - "| 794 | 95 | 95 |", - "+-----+-----+-----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_window_range_equivalent_frames() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - c9, - COUNT(*) OVER(ORDER BY c9, c1 RANGE BETWEEN CURRENT ROW AND CURRENT ROW) AS cnt1, - COUNT(*) OVER(ORDER BY c9, c1 RANGE UNBOUNDED PRECEDING) AS cnt2, - COUNT(*) OVER(ORDER BY c9, c1 RANGE CURRENT ROW) AS cnt3, - COUNT(*) OVER(RANGE BETWEEN CURRENT ROW AND CURRENT ROW) AS cnt4, - COUNT(*) OVER(RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cnt5, - COUNT(*) OVER(RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS cnt6 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------+------+------+------+------+------+------+", - "| c9 | cnt1 | cnt2 | cnt3 | cnt4 | cnt5 | cnt6 |", - "+-----------+------+------+------+------+------+------+", - "| 28774375 | 1 | 1 | 1 | 100 | 100 | 100 |", - "| 63044568 | 1 | 2 | 1 | 100 | 100 | 100 |", - "| 141047417 | 1 | 3 | 1 | 100 | 100 | 100 |", - "| 141680161 | 1 | 4 | 1 | 100 | 100 | 100 |", - "| 145294611 | 1 | 5 | 1 | 100 | 100 | 100 |", - "+-----------+------+------+------+------+------+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_window_cume_dist() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - c8, - CUME_DIST() OVER(ORDER BY c9) as cd1, - CUME_DIST() OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as cd2 - FROM aggregate_test_100 - ORDER BY c8 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----+------+------+", - "| c8 | cd1 | cd2 |", - "+-----+------+------+", - "| 102 | 0.73 | 0.73 |", - "| 299 | 0.01 | 0.01 |", - "| 363 | 0.41 | 0.41 |", - "| 417 | 0.14 | 0.14 |", - "| 794 | 0.95 | 0.95 |", - "+-----+------+------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_window_rank() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - c9, - RANK() OVER(ORDER BY c1) AS rank1, - RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2, - DENSE_RANK() OVER(ORDER BY c1) as dense_rank1, - DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2, - PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1, - PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - - let actual = 
execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------+-------+-------+-------------+-------------+---------------------+---------------------+", - "| c9 | rank1 | rank2 | dense_rank1 | dense_rank2 | percent_rank1 | percent_rank2 |", - "+-----------+-------+-------+-------------+-------------+---------------------+---------------------+", - "| 28774375 | 80 | 80 | 5 | 5 | 0.797979797979798 | 0.797979797979798 |", - "| 63044568 | 62 | 62 | 4 | 4 | 0.6161616161616161 | 0.6161616161616161 |", - "| 141047417 | 1 | 1 | 1 | 1 | 0 | 0 |", - "| 141680161 | 41 | 41 | 3 | 3 | 0.40404040404040403 | 0.40404040404040403 |", - "| 145294611 | 1 | 1 | 1 | 1 | 0 | 0 |", - "+-----------+-------+-------+-------------+-------------+---------------------+---------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_lag_lead() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - let sql = "SELECT - c9, - LAG(c9, 2, 10101) OVER(ORDER BY c9) as lag1, - LAG(c9, 2, 10101) OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as lag2, - LEAD(c9, 2, 10101) OVER(ORDER BY c9) as lead1, - LEAD(c9, 2, 10101) OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as lead2 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-----------+-----------+-----------+-----------+-----------+", - "| c9 | lag1 | lag2 | lead1 | lead2 |", - "+-----------+-----------+-----------+-----------+-----------+", - "| 28774375 | 10101 | 10101 | 141047417 | 141047417 |", - "| 63044568 | 10101 | 10101 | 141680161 | 141680161 |", - "| 141047417 | 28774375 | 28774375 | 145294611 | 145294611 |", - "| 141680161 | 63044568 | 63044568 | 225513085 | 225513085 |", - "| 145294611 | 141047417 | 141047417 | 243203849 | 243203849 |", - "+-----------+-----------+-----------+-----------+-----------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_window_frame_first_value_last_value_aggregate() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - - let sql = "SELECT - FIRST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING) as first_value1, - FIRST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) as first_value2, - LAST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING) as last_value1, - LAST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) as last_value2 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+--------------+--------------+-------------+-------------+", - "| first_value1 | first_value2 | last_value1 | last_value2 |", - "+--------------+--------------+-------------+-------------+", - "| -16110 | -16110 | 3917 | -1114 |", - "| -16110 | -16110 | -16974 | 15673 |", - "| -16110 | -16110 | -1114 | 13630 |", - "| -16110 | 3917 | 15673 | -13217 |", - "| -16110 | -16974 | 13630 | 20690 |", - "+--------------+--------------+-------------+-------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn test_window_frame_nth_value_aggregate() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - 
register_aggregate_csv(&ctx).await?; - - let sql = "SELECT - NTH_VALUE(c4, 3) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING) as nth_value1, - NTH_VALUE(c4, 2) OVER(ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) as nth_value2 - FROM aggregate_test_100 - ORDER BY c9 - LIMIT 5"; - - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+------------+------------+", - "| nth_value1 | nth_value2 |", - "+------------+------------+", - "| | 3917 |", - "| -16974 | 3917 |", - "| -16974 | -16974 |", - "| -1114 | -1114 |", - "| 15673 | 15673 |", - "+------------+------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - #[tokio::test] async fn test_window_agg_sort() -> Result<()> { // We need to specify the target partition number. @@ -2174,7 +912,6 @@ async fn test_window_agg_with_global_limit() -> Result<()> { " AggregateExec: mode=Partial, gby=[], aggr=[ARRAYAGG(aggregate_test_100.c13)]", " GlobalLimitExec: skip=0, fetch=1", " SortExec: fetch=1, expr=[c13@0 ASC NULLS LAST]", - " ProjectionExec: expr=[c13@0 as c13]", ] }; @@ -2521,11 +1258,11 @@ mod tests { "+------+------+------+------+------+------+--------+--------+-------------------+-------------------+", "| sum1 | sum2 | min1 | min2 | max1 | max2 | count1 | count2 | avg1 | avg2 |", "+------+------+------+------+------+------+--------+--------+-------------------+-------------------+", - "| 16 | 6 | 1 | 1 | 10 | 5 | 3 | 2 | 5.333333333333333 | 3 |", - "| 16 | 6 | 1 | 1 | 10 | 5 | 3 | 2 | 5.333333333333333 | 3 |", + "| 16 | 6 | 1 | 1 | 10 | 5 | 3 | 2 | 5.333333333333333 | 3.0 |", + "| 16 | 6 | 1 | 1 | 10 | 5 | 3 | 2 | 5.333333333333333 | 3.0 |", "| 51 | 16 | 1 | 1 | 20 | 10 | 5 | 3 | 10.2 | 5.333333333333333 |", - "| 72 | 72 | 1 | 1 | 21 | 21 | 6 | 6 | 12 | 12 |", - "| 72 | 72 | 1 | 1 | 21 | 21 | 6 | 6 | 12 | 12 |", + "| 72 | 72 | 1 | 1 | 21 | 21 | 6 | 6 | 12.0 | 12.0 |", + "| 72 | 72 | 1 | 1 | 21 | 21 | 6 | 6 | 12.0 | 12.0 |", "+------+------+------+------+------+------+--------+--------+-------------------+-------------------+", ]; assert_batches_eq!(expected, &actual); diff --git a/datafusion/core/tests/sqllogictests/README.md b/datafusion/core/tests/sqllogictests/README.md index 2f697921a0bc..f3a06ec5a684 100644 --- a/datafusion/core/tests/sqllogictests/README.md +++ b/datafusion/core/tests/sqllogictests/README.md @@ -76,7 +76,7 @@ docker run \ #### Updating tests: Completion Mode -In test script completion mode, `sqllogictests` reads a prototype script and runs the statements and queries against the database engine. The output is is a full script that is a copy of the prototype script with result inserted. +In test script completion mode, `sqllogictests` reads a prototype script and runs the statements and queries against the database engine. The output is a full script that is a copy of the prototype script with result inserted. You can update the tests / generate expected output by passing the `--complete` argument. 
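As a rough illustration of the completion-mode workflow described above (this example is not part of the patch; the package name and invocation details are assumptions based on the test paths and may differ in your checkout):

```shell
# Run the sqllogictest suite as usual (binary name taken from datafusion/core/tests/sqllogictests)
cargo test -p datafusion --test sqllogictests

# Re-run in completion mode so the runner emits a copy of each prototype
# script with the actual results inserted
cargo test -p datafusion --test sqllogictests -- --complete
```

The regenerated expected output should be reviewed manually before being committed, since it replaces the hand-checked results in the `.slt` files.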
diff --git a/datafusion/core/tests/sqllogictests/test_files/explain.slt b/datafusion/core/tests/sqllogictests/test_files/explain.slt index 9eca732c45f3..fe1d3ac2e4c4 100644 --- a/datafusion/core/tests/sqllogictests/test_files/explain.slt +++ b/datafusion/core/tests/sqllogictests/test_files/explain.slt @@ -59,9 +59,8 @@ query TT EXPLAIN select count(*) from (values ('a', 1, 100), ('a', 2, 150)) as t (c1,c2,c3) ---- physical_plan -ProjectionExec: expr=[COUNT(UInt8(1))@0 as COUNT(UInt8(1))] - ProjectionExec: expr=[2 as COUNT(UInt8(1))] - EmptyExec: produce_one_row=true +ProjectionExec: expr=[2 as COUNT(UInt8(1))] + EmptyExec: produce_one_row=true statement ok set datafusion.explain.physical_plan_only = false diff --git a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt index 7e4e98125548..25e4195fba6c 100644 --- a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt +++ b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt @@ -156,7 +156,7 @@ datafusion.optimizer.max_passes 3 datafusion.optimizer.prefer_hash_join true datafusion.optimizer.repartition_aggregations true datafusion.optimizer.repartition_file_min_size 10485760 -datafusion.optimizer.repartition_file_scans false +datafusion.optimizer.repartition_file_scans true datafusion.optimizer.repartition_joins true datafusion.optimizer.repartition_sorts true datafusion.optimizer.repartition_windows true diff --git a/datafusion/core/tests/sqllogictests/test_files/intersection.slt b/datafusion/core/tests/sqllogictests/test_files/intersection.slt new file mode 100644 index 000000000000..31121a333df8 --- /dev/null +++ b/datafusion/core/tests/sqllogictests/test_files/intersection.slt @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +statement ok +CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; + +query ?I +SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 + INTERSECT SELECT * FROM (SELECT null AS id1, 2 AS id2) t2 +---- + + +query ?I +SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 + INTERSECT SELECT * FROM (SELECT null AS id1, 1 AS id2) t2 +---- +NULL 1 + + +query IR +SELECT int_col, double_col FROM alltypes_plain where int_col > 0 INTERSECT ALL SELECT int_col, double_col FROM alltypes_plain LIMIT 4 +---- +1 10.1 +1 10.1 +1 10.1 +1 10.1 + +query IR +SELECT int_col, double_col FROM alltypes_plain where int_col > 0 INTERSECT SELECT int_col, double_col FROM alltypes_plain +---- +1 10.1 diff --git a/datafusion/core/tests/sqllogictests/test_files/limit.slt b/datafusion/core/tests/sqllogictests/test_files/limit.slt new file mode 100644 index 000000000000..253ca8f335af --- /dev/null +++ b/datafusion/core/tests/sqllogictests/test_files/limit.slt @@ -0,0 +1,302 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +########## +## Limit Tests +########## + +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv' + +# async fn csv_query_limit +query T +SELECT c1 FROM aggregate_test_100 LIMIT 2 +---- +c +d + +# async fn csv_query_limit_bigger_than_nbr_of_rows +query I +SELECT c2 FROM aggregate_test_100 LIMIT 200 +---- +2 +5 +1 +1 +5 +4 +3 +3 +1 +4 +1 +4 +3 +2 +1 +1 +2 +1 +3 +2 +4 +1 +5 +4 +2 +1 +4 +5 +2 +3 +4 +2 +1 +5 +3 +1 +2 +3 +3 +3 +2 +4 +1 +3 +2 +5 +2 +1 +4 +1 +4 +2 +5 +4 +2 +3 +4 +4 +4 +5 +4 +2 +1 +2 +4 +2 +3 +5 +1 +1 +4 +2 +1 +2 +1 +1 +5 +4 +5 +2 +3 +2 +4 +1 +3 +4 +3 +2 +5 +3 +3 +2 +5 +5 +4 +1 +3 +3 +4 +4 + +# async fn csv_query_limit_with_same_nbr_of_rows +query I +SELECT c2 FROM aggregate_test_100 LIMIT 100 +---- +2 +5 +1 +1 +5 +4 +3 +3 +1 +4 +1 +4 +3 +2 +1 +1 +2 +1 +3 +2 +4 +1 +5 +4 +2 +1 +4 +5 +2 +3 +4 +2 +1 +5 +3 +1 +2 +3 +3 +3 +2 +4 +1 +3 +2 +5 +2 +1 +4 +1 +4 +2 +5 +4 +2 +3 +4 +4 +4 +5 +4 +2 +1 +2 +4 +2 +3 +5 +1 +1 +4 +2 +1 +2 +1 +1 +5 +4 +5 +2 +3 +2 +4 +1 +3 +4 +3 +2 +5 +3 +3 +2 +5 +5 +4 +1 +3 +3 +4 +4 + +# async fn csv_query_limit_zero +query T +SELECT c1 FROM aggregate_test_100 LIMIT 0 +---- + +# async fn csv_offset_without_limit_99 +query T +SELECT c1 FROM aggregate_test_100 OFFSET 99 +---- +e + +# async fn csv_offset_without_limit_100 +query T +SELECT c1 FROM aggregate_test_100 OFFSET 100 +---- + +# async fn csv_offset_without_limit_101 +query T +SELECT c1 FROM aggregate_test_100 OFFSET 101 +---- + +# async fn csv_query_offset +query T +SELECT c1 FROM aggregate_test_100 OFFSET 2 LIMIT 2 +---- +b +a + +# async fn csv_query_offset_the_same_as_nbr_of_rows +query T +SELECT c1 FROM aggregate_test_100 LIMIT 1 OFFSET 100 +---- + +# async fn csv_query_offset_bigger_than_nbr_of_rows +query T +SELECT c1 FROM aggregate_test_100 LIMIT 1 OFFSET 101 +---- + +######## +# Clean up after the test +######## + +statement ok +drop table aggregate_test_100; diff --git a/datafusion/core/tests/sqllogictests/test_files/order.slt b/datafusion/core/tests/sqllogictests/test_files/order.slt index 2ae39712c998..6c2bb3abc7d5 100644 --- a/datafusion/core/tests/sqllogictests/test_files/order.slt +++ b/datafusion/core/tests/sqllogictests/test_files/order.slt @@ -257,6 +257,41 @@ ORDER BY time; statement error DataFusion error: This feature is not implemented: SORT BY select * from t SORT BY time; + +# distinct on a column not in the select list should not work +statement error For SELECT DISTINCT, ORDER BY expressions time must appear in select list +SELECT DISTINCT value FROM t ORDER BY time; + +# distinct on an expression of a column not in the select list should not work +statement error For SELECT DISTINCT, ORDER BY expressions time must appear in select list +SELECT DISTINCT date_trunc('hour', time) FROM t ORDER BY time; + +# distinct on a column that is in the select list but aliasted should work +query P +SELECT DISTINCT time as "first_seen" FROM t ORDER BY "first_seen"; +---- +2022-01-01T00:00:30 +2022-01-01T01:00:10 +2022-01-02T00:00:20 + +# distinct on a column that is in the select list, but aliased (though +# the reference is to original expr) should work +query P +SELECT DISTINCT time as "first_seen" FROM t ORDER BY time; +---- 
+2022-01-01T00:00:30 +2022-01-01T01:00:10 +2022-01-02T00:00:20 + +# distinct on a column that is in the select list, but aliased (though +# the reference is its ordinal position) should work +query P +SELECT DISTINCT time as "first_seen" FROM t ORDER BY 1; +---- +2022-01-01T00:00:30 +2022-01-01T01:00:10 +2022-01-02T00:00:20 + ## Cleanup statement ok drop table t; diff --git a/datafusion/core/tests/sqllogictests/test_files/predicates.slt b/datafusion/core/tests/sqllogictests/test_files/predicates.slt new file mode 100644 index 000000000000..952a369642ea --- /dev/null +++ b/datafusion/core/tests/sqllogictests/test_files/predicates.slt @@ -0,0 +1,279 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Limit Tests +########## + +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv' + +statement ok +CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; + + +# async fn csv_query_with_predicate() +query TR +SELECT c1, c12 FROM aggregate_test_100 WHERE c12 > 0.376 AND c12 < 0.4 +---- +e 0.391444365692 +d 0.38870280984 + +# async fn csv_query_with_negative_predicate +query TI +SELECT c1, c4 FROM aggregate_test_100 WHERE c3 < -55 AND -c4 > 30000 +---- +e -31500 +c -30187 + +# async fn csv_query_with_negated_predicate() +query I +SELECT COUNT(1) FROM aggregate_test_100 WHERE NOT(c1 != 'a') +---- +21 + +# async fn csv_query_with_is_not_null_predicate +query I +SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NOT NULL +---- +100 + +# async fn csv_query_with_is_null_predicate +query I +SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NULL +---- +0 + +# async fn query_where_neg_num +query II +select c7, c8 from aggregate_test_100 where c7 >= -2 and c7 < 10 +---- +7 45465 +5 40622 +0 61069 +2 20120 +4 39363 + +query II +select c7, c8 from aggregate_test_100 where c7 >= -2.9 and c7 < 10 +---- +7 45465 +5 40622 +0 61069 +2 20120 +4 39363 + +# async fn like +query I +SELECT COUNT(c1) FROM aggregate_test_100 WHERE c13 LIKE '%FB%' +---- +1 + +# async fn csv_between_expr +query I +SELECT c4 FROM aggregate_test_100 WHERE c12 BETWEEN 0.995 AND 1.0 +---- +10837 + +# async fn csv_between_expr_negated +query I +SELECT c4 FROM aggregate_test_100 WHERE c12 NOT BETWEEN 0 AND 0.995 +---- +10837 + +# async fn csv_in_set_test +query I +SELECT count(*) FROM aggregate_test_100 WHERE c7 in 
('25','155','204','77','208','67','139','191','26','7','202','113','129','197','249','146','129','220','154','163','220','19','71','243','150','231','196','170','99','255') +---- +36 + +# async fn except_with_null_not_equal +query ?I +SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 +EXCEPT +SELECT * FROM (SELECT null AS id1, 2 AS id2) t2 +---- +NULL 1 + +# async fn except_with_null_equal +query ?I +SELECT * FROM (SELECT null AS id1, 1 AS id2) t1 +EXCEPT +SELECT * FROM (SELECT null AS id1, 1 AS id2) t2 +---- + +statement ok +CREATE TABLE IF NOT EXISTS test AS VALUES('foo'),('bar'),(NULL),('fazzz'); + +# async fn like_on_strings +query T +SELECT * FROM test WHERE column1 LIKE '%a%' +---- +bar +fazzz + +# async fn like_on_string_dictionaries +query T +SELECT * FROM test WHERE column1 LIKE '%a%' +---- +bar +fazzz + +# async fn in_list_string_dictionaries_with_null +query T +SELECT * FROM test WHERE column1 IN ('Bar') +---- + +query T +SELECT * FROM test WHERE column1 IN ('foo') +---- +foo + +query T +SELECT * FROM test WHERE column1 IN ('bar', 'foo') +---- +foo +bar + +query T +SELECT * FROM test WHERE column1 IN ('Bar', 'foo') +---- +foo + +query T +SELECT * FROM test WHERE column1 IN ('foo', 'Bar', 'fazzz') +---- +foo +fazzz + + +# async fn in_set_string_dictionaries +query T +SELECT * FROM test WHERE column1 IN ('foo', 'Bar', 'fazzz') +---- +foo +fazzz + +statement ok +DROP TABLE test; + +statement ok +CREATE TABLE IF NOT EXISTS test AS VALUES('foo'),('Barrr'),('Bazzz'),('ZZZZZ'); + +# async fn test_regexp_is_match +query T +SELECT * FROM test WHERE column1 ~ 'z' +---- +Bazzz + +query T +SELECT * FROM test WHERE column1 ~* 'z' +---- +Bazzz +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~ 'z' +---- +foo +Barrr +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~* 'z' +---- +foo +Barrr + +statement ok +DROP TABLE test; + +statement ok +CREATE TABLE IF NOT EXISTS test AS VALUES('foo'),('bar'),('fazzz'); + +# async fn in_list_string_dictionaries +query T +SELECT * FROM test WHERE column1 IN ('Bar') +---- + +query T +SELECT * FROM test WHERE column1 IN ('foo') +---- +foo + +query T +SELECT * FROM test WHERE column1 IN ('bar', 'foo') +---- +foo +bar + +query T +SELECT * FROM test WHERE column1 IN ('Bar', 'foo') +---- +foo + +query T +SELECT * FROM test WHERE column1 IN ('foo', 'Bar', 'fazzz') +---- +foo +fazzz + +# async fn test_expect_all +query IR +SELECT int_col, double_col FROM alltypes_plain where int_col > 0 EXCEPT ALL SELECT int_col, double_col FROM alltypes_plain where int_col < 1 +---- +1 10.1 +1 10.1 +1 10.1 +1 10.1 + +# async fn test_expect_distinct +query IR +SELECT int_col, double_col FROM alltypes_plain where int_col > 0 EXCEPT SELECT int_col, double_col FROM alltypes_plain where int_col < 1 +---- +1 10.1 + + +######## +# Clean up after the test +######## + +statement ok +drop table aggregate_test_100; + +statement ok +drop table alltypes_plain; + +statement ok +DROP TABLE test; diff --git a/datafusion/core/tests/sqllogictests/test_files/window.slt b/datafusion/core/tests/sqllogictests/test_files/window.slt index 7c83c1b9b555..a7054d4ae32f 100644 --- a/datafusion/core/tests/sqllogictests/test_files/window.slt +++ b/datafusion/core/tests/sqllogictests/test_files/window.slt @@ -35,6 +35,16 @@ STORED AS CSV WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +statement ok +CREATE EXTERNAL TABLE null_cases( + c1 BIGINT NULL, + c2 DOUBLE NULL, + c3 BIGINT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION 'tests/data/null_cases.csv'; + ### This is the same 
table as ### execute_with_partition with 4 partitions statement ok @@ -248,18 +258,17 @@ Sort: d.b ASC NULLS LAST Aggregate: groupBy=[[d.b]], aggr=[[MAX(d.a)]] SubqueryAlias: d SubqueryAlias: _data2 - Projection: s.a, s.b - SubqueryAlias: s - SubqueryAlias: _sample_data - Union - Projection: Int64(1) AS a, Utf8("aa") AS b - EmptyRelation - Projection: Int64(3) AS a, Utf8("aa") AS b - EmptyRelation - Projection: Int64(5) AS a, Utf8("bb") AS b - EmptyRelation - Projection: Int64(7) AS a, Utf8("bb") AS b - EmptyRelation + SubqueryAlias: s + SubqueryAlias: _sample_data + Union + Projection: Int64(1) AS a, Utf8("aa") AS b + EmptyRelation + Projection: Int64(3) AS a, Utf8("aa") AS b + EmptyRelation + Projection: Int64(5) AS a, Utf8("bb") AS b + EmptyRelation + Projection: Int64(7) AS a, Utf8("bb") AS b + EmptyRelation physical_plan SortPreservingMergeExec: [b@0 ASC NULLS LAST] SortExec: expr=[b@0 ASC NULLS LAST] @@ -268,16 +277,15 @@ SortPreservingMergeExec: [b@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 4), input_partitions=4 AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[MAX(d.a)] - ProjectionExec: expr=[a@0 as a, b@1 as b] - UnionExec - ProjectionExec: expr=[1 as a, aa as b] - EmptyExec: produce_one_row=true - ProjectionExec: expr=[3 as a, aa as b] - EmptyExec: produce_one_row=true - ProjectionExec: expr=[5 as a, bb as b] - EmptyExec: produce_one_row=true - ProjectionExec: expr=[7 as a, bb as b] - EmptyExec: produce_one_row=true + UnionExec + ProjectionExec: expr=[1 as a, aa as b] + EmptyExec: produce_one_row=true + ProjectionExec: expr=[3 as a, aa as b] + EmptyExec: produce_one_row=true + ProjectionExec: expr=[5 as a, bb as b] + EmptyExec: produce_one_row=true + ProjectionExec: expr=[7 as a, bb as b] + EmptyExec: produce_one_row=true # Check actual result: query TI @@ -368,11 +376,6 @@ SortPreservingMergeExec: [b@0 ASC NULLS LAST] EmptyExec: produce_one_row=true - - - - - # check actual result query TII @@ -398,3 +401,792 @@ WITH _sample_data AS ( ---- aa 3 2 bb 7 2 + + +# async fn window_in_expression +query I rowsort +select 1 - lag(amount, 1) over (order by idx) as column1 from (values ('a', 1, 100), ('a', 2, 150)) as t (col1, idx, amount) +--- +---- +-99 +NULL + + +# async fn window_with_agg_in_expression +query TIIIII +select col1, idx, count(*), sum(amount), lag(sum(amount), 1) over (order by idx) as prev_amount, +sum(amount) - lag(sum(amount), 1) over (order by idx) as difference from ( +select * from (values ('a', 1, 100), ('a', 2, 150)) as t (col1, idx, amount) +) a +group by col1, idx +---- +a 1 1 100 NULL NULL +a 2 1 150 100 50 + + +# async fn window_frame_empty +query II +SELECT +SUM(c3) OVER() as sum1, +COUNT(*) OVER () as count1 +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +781 100 +781 100 +781 100 +781 100 +781 100 + +# async fn window_frame_rows_preceding +query IRI +SELECT +SUM(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +AVG(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +COUNT(*) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +-48302 -16100.666666666666 3 +11243 3747.666666666667 3 +-51311 -17103.666666666668 3 +-2391 -797 3 +46756 15585.333333333334 3 + + +# async fn window_frame_rows_preceding_stddev_variance +query RRRR +SELECT +VAR(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +VAR_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), 
+STDDEV(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +STDDEV_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +46721.33333333174 31147.555555554496 216.151181660734 176.486700789477 +2639429.333333332 1759619.5555555548 1624.632060908971 1326.50652299774 +746202.3333333324 497468.2222222216 863.830037295146 705.314271954156 +768422.9999999981 512281.9999999988 876.597399037893 715.738779164577 +66526.3333333288 44350.88888888587 257.926992254259 210.596507304575 + +# async fn window_frame_rows_preceding_with_partition_unique_order_by +query IRI +SELECT +SUM(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +AVG(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +COUNT(*) OVER(PARTITION BY c2 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +-38611 -19305.5 2 +17547 8773.5 2 +-1301 -650.5 2 +26638 13319 3 +26861 8953.666666666666 3 + +# /// The partition by clause conducts sorting according to given partition column by default. If the +# /// sorting columns have non unique values, the unstable sorting may produce indeterminate results. +# /// Therefore, we are commenting out the following test for now. + +#// #[tokio::test] +#// async fn window_frame_rows_preceding_with_non_unique_partition +#// let ctx = SessionContext::new(); +#// register_aggregate_csv(&ctx).await?; +#// let sql = "SELECT +#// SUM(c4) OVER(PARTITION BY c1 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +#// COUNT(*) OVER(PARTITION BY c2 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +#// FROM aggregate_test_100 +#// ORDER BY c9 +#// LIMIT 5 +#// let actual = execute_to_batches(&ctx, sql).await; +#// let expected = vec![ +#// "+----------------------------+-----------------+", +#// "| SUM(aggregate_test_100.c4) | COUNT(UInt8(1)) |", +#// "+----------------------------+-----------------+", +#// "| -33822 | 3|", +#// "| 20808 | 3|", +#// "| -29881 | 3|", +#// "| -47613 | 3|", +#// "| -13474 | 3|", +#// "+----------------------------+-----------------+", +#// ]; +#// assert_batches_eq!(expected, &actual); +#// Ok(()) +#// } + +# async fn window_frame_ranges_preceding_following_desc +query III +SELECT +SUM(c4) OVER(ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), +SUM(c3) OVER(ORDER BY c2 DESC RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING), +COUNT(*) OVER(ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +52276 781 56 +260620 781 63 +-28623 781 37 +260620 781 63 +260620 781 63 + +# async fn window_frame_large_range +# Range offset 10000 is too big for Int8 (i.e. the type of c3). +# In this case, we should be able to still produce correct results. +# See the issue: https://github.com/apache/arrow-datafusion/issues/5346 +# below over clause is equivalent to OVER(ORDER BY c3 DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +# in terms of behaviour. 
+query I +SELECT +SUM(c3) OVER(ORDER BY c3 DESC RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING) as summation1 +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +781 +781 +781 +781 +781 + +# async fn window_frame_order_by_asc_desc_large +query I +SELECT + SUM(c5) OVER (ORDER BY c2 ASC, c6 DESC) as sum1 + FROM aggregate_test_100 + LIMIT 5 +---- +-1383162419 +-3265456275 +-3909681744 +-5241214934 +-4246910946 + + +# async fn window_frame_order_by_desc_large +query I +SELECT + SUM(c5) OVER (ORDER BY c2 DESC, c6 ASC) as sum1 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +11212193439 +22799733943 +2935356871 +15810962683 +18035025006 + +# async fn window_frame_order_by_null_timestamp_order_by +query I +SELECT + SUM(c1) OVER (ORDER BY c2 DESC) as summation1 + FROM null_cases + LIMIT 5 +---- +962 +962 +962 +962 +962 + +# async fn window_frame_order_by_null_desc +query I +SELECT + COUNT(c2) OVER (ORDER BY c1 DESC RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) + FROM null_cases + LIMIT 5 +---- +9 +9 +9 +9 +9 + +# async fn window_frame_order_by_null_asc +query I +SELECT + COUNT(c2) OVER (ORDER BY c1 RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) + FROM null_cases + ORDER BY c1 + LIMIT 5 +---- +2 +2 +2 +2 +5 + +# async fn window_frame_order_by_null_asc_null_first +query I +SELECT + COUNT(c2) OVER (ORDER BY c1 NULLS FIRST RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) + FROM null_cases + LIMIT 5 +---- +9 +9 +9 +9 +9 + +# async fn window_frame_order_by_null_desc_null_last +query I +SELECT + COUNT(c2) OVER (ORDER BY c1 DESC NULLS LAST RANGE BETWEEN 5 PRECEDING AND 3 FOLLOWING) + FROM null_cases + LIMIT 5 +---- +5 +5 +5 +6 +6 + +# async fn window_frame_rows_order_by_null +query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +SELECT + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as a, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as b, + SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as c, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as d, + SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as e, + SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as f, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING) as g, + SUM(c1) OVER (ORDER BY c3) as h, + SUM(c1) OVER (ORDER BY c3 DESC) as i, + SUM(c1) OVER (ORDER BY c3 NULLS first) as j, + SUM(c1) OVER (ORDER BY c3 DESC NULLS first) as k, + SUM(c1) OVER (ORDER BY c3 DESC NULLS last) as l, + SUM(c1) OVER (ORDER BY c3, c2) as m, + SUM(c1) OVER (ORDER BY c3, c1 DESC) as n, + SUM(c1) OVER (ORDER BY c3 DESC, c1) as o, + SUM(c1) OVER (ORDER BY c3, c1 NULLs first) as p, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a1, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as b1, + SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as c1, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as d1, + SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as e1, + SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as f1, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as g1, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as h1, + SUM(c1) OVER (ORDER BY c3 RANGE 
BETWEEN UNBOUNDED PRECEDING AND current row) as j1, + SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as k1, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as l1, + SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as m1, + SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as n1, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN UNBOUNDED PRECEDING AND current row) as o1, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as h11, + SUM(c1) OVER (ORDER BY c3 RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as j11, + SUM(c1) OVER (ORDER BY c3 DESC RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as k11, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as l11, + SUM(c1) OVER (ORDER BY c3 DESC NULLS last RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as m11, + SUM(c1) OVER (ORDER BY c3 DESC NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as n11, + SUM(c1) OVER (ORDER BY c3 NULLS first RANGE BETWEEN current row AND UNBOUNDED FOLLOWING) as o11 + FROM null_cases + ORDER BY c3 + LIMIT 5 +---- +412 412 339 412 339 339 412 NULL 4627 NULL 4627 4627 NULL NULL 4627 NULL 412 412 4627 412 4627 4627 412 NULL NULL 4627 NULL 4627 4627 NULL 4627 4627 NULL 4627 NULL NULL 4627 +488 488 412 488 412 412 488 72 4627 72 4627 4627 72 72 4627 72 488 488 4627 488 4627 4627 488 72 72 4627 72 4627 4627 72 4627 4627 72 4627 72 72 4627 +543 543 488 543 488 488 543 96 4555 96 4555 4555 96 96 4555 96 543 543 4627 543 4627 4627 543 96 96 4555 96 4555 4555 96 4555 4555 96 4555 96 96 4555 +553 553 543 553 543 543 553 115 4531 115 4531 4531 115 115 4531 115 553 553 4627 553 4627 4627 553 115 115 4531 115 4531 4531 115 4531 4531 115 4531 115 115 4531 +553 553 553 553 553 553 553 140 4512 140 4512 4512 140 140 4512 140 553 553 4627 553 4627 4627 553 140 140 4512 140 4512 4512 140 4512 4512 140 4512 140 140 4512 + + + +# window_frame_rows_preceding_with_unique_partition +query II +SELECT +SUM(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +COUNT(*) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +-38611 2 +17547 2 +-1301 2 +26638 2 +26861 3 + + +#fn window_frame_ranges_preceding_following +statement error DataFusion error: Internal error: Operator \- is not implemented for types +SELECT +SUM(c4) OVER(ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING), +SUM(c3) OVER(ORDER BY c2 RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING), +COUNT(*) OVER(ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 + + +#fn window_frame_ranges_ntile +query II +SELECT +NTILE(8) OVER (ORDER BY C4) as ntile1, +NTILE(12) OVER (ORDER BY C12 DESC) as ntile2 +FROM aggregate_test_100 +ORDER BY c7 +LIMIT 5 +---- +8 12 +5 11 +3 11 +2 7 +7 12 + +#fn window_frame_ranges_string_check +query II +SELECT +SUM(LENGTH(c13)) OVER(ORDER BY c13), +SUM(LENGTH(c1)) OVER(ORDER BY c1) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +2100 100 +510 79 +1440 21 +1830 61 +2010 21 + + +#fn window_frame_order_by_unique +query II +SELECT +SUM(c5) OVER (ORDER BY c5) as sum1, +COUNT(*) OVER (ORDER BY c9) as count1 +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +-49877765574 1 +-50025861694 2 +-45402230071 3 +-14557735645 4 +-18365391649 5 + + +# /// If 
the sorting columns have non unique values, the unstable sorting may produce +# /// indeterminate results. Therefore, we are commenting out the following test for now. +# /// +# // #[tokio::test] +# // async fn window_frame_order_by_non_unique +# // let ctx = SessionContext::new(); +# // register_aggregate_csv(&ctx).await?; +# // let sql = "SELECT \ +# // c2, \ +# // c9, \ +# // SUM(c5) OVER (ORDER BY c2), \ +# // COUNT(*) OVER (ORDER BY c2) \ +# // FROM aggregate_test_100 \ +# // ORDER BY c2 \ +# // LIMIT 5"; +# // let actual = execute_to_batches(&ctx, sql).await; +# // let expected = vec![ +# // "+----+------------+----------------------------+-----------------+", +# // "| c2 | c9 | SUM(aggregate_test_100.c5) | COUNT(UInt8(1)) |", +# // "+----+------------+----------------------------+-----------------+", +# // "| 1 | 879082834 | -438598674 | 22 |", +# // "| 1 | 3542840110 | -438598674 | 22 |", +# // "| 1 | 3275293996 | -438598674 | 22 |", +# // "| 1 | 774637006 | -438598674 | 22 |", +# // "| 1 | 4015442341 | -438598674 | 22 |", +# // "+----+------------+----------------------------+-----------------+", +# // ]; +# // assert_batches_eq!(expected, &actual); +# // Ok(()) +# // } + +#fn window_frame_ranges_unbounded_preceding_following +query II +SELECT +SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as sum1, +COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as cnt1 +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +285 100 +123 63 +285 100 +123 63 +123 63 + + +#fn window_frame_ranges_preceding_and_preceding +query II +SELECT +SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN 3 PRECEDING AND 1 PRECEDING), +COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN 3 PRECEDING AND 1 PRECEDING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +123 63 +22 22 +193 64 +22 22 +22 22 + +#fn window_frame_ranges_unbounded_preceding_following_diff_col +query II +SELECT +SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING), +COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +162 37 +101 41 +70 14 +101 41 +101 41 + +#fn window_frame_partition_by_order_by_desc +query I +SELECT +SUM(c4) OVER(PARTITION BY c1 ORDER BY c2 DESC RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +-124618 +205080 +-40819 +-19517 +47246 + + +#fn window_frame_range_float +query R +SELECT + SUM(c12) OVER (ORDER BY C12 RANGE BETWEEN 0.2 PRECEDING AND 0.2 FOLLOWING) + FROM aggregate_test_100 + ORDER BY C9 + LIMIT 5 +---- +2.547670180363 +10.629941254821 +2.547670180363 +20.349518503437 +21.408674363508 + +#fn window_frame_ranges_timestamp + +statement ok +create table temp as values +(1664264591000000000), +(1664264592000000000), +(1664264592000000000), +(1664264593000000000), +(1664264594000000000), +(1664364594000000000), +(1664464594000000000), +(1664564594000000000); + +statement ok +create table t as select cast(column1 as timestamp) as ts from temp; + +query PIII +SELECT + ts, + COUNT(*) OVER (ORDER BY ts RANGE BETWEEN INTERVAL '1' DAY PRECEDING AND INTERVAL '2 DAY' FOLLOWING) AS cnt1, + COUNT(*) OVER (ORDER BY ts RANGE BETWEEN '0 DAY' PRECEDING AND '0' DAY FOLLOWING) as cnt2, + COUNT(*) OVER (ORDER BY ts RANGE BETWEEN '5' SECOND PRECEDING AND CURRENT ROW) as cnt3 + FROM t + ORDER BY ts +---- +2022-09-27T07:43:11 6 1 1 +2022-09-27T07:43:12 6 2 3 +2022-09-27T07:43:12 6 2 3 +2022-09-27T07:43:13 6 1 4 +2022-09-27T07:43:14 6 1 5 +2022-09-28T11:29:54 2 1 1 
+2022-09-29T15:16:34 2 1 1 +2022-09-30T19:03:14 1 1 1 + +statement ok +drop table t + +statement ok +drop table temp + + +#fn window_frame_ranges_unbounded_preceding_err +statement error DataFusion error: Execution error: Invalid window frame: end bound cannot be unbounded preceding +SELECT +SUM(c2) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED PRECEDING), +COUNT(*) OVER (ORDER BY c2 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED PRECEDING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 + + +#fn window_frame_groups_preceding_following_desc +query III +SELECT +SUM(c4) OVER(ORDER BY c2 DESC GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING), +SUM(c3) OVER(ORDER BY c2 DESC GROUPS BETWEEN 10000 PRECEDING AND 10000 FOLLOWING), +COUNT(*) OVER(ORDER BY c2 DESC GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) +FROM aggregate_test_100 +ORDER BY c9 +LIMIT 5 +---- +52276 781 56 +260620 781 63 +-28623 781 37 +260620 781 63 +260620 781 63 + +#fn window_frame_groups_order_by_null_desc +query I +SELECT +COUNT(c2) OVER (ORDER BY c1 DESC GROUPS BETWEEN 5 PRECEDING AND 3 FOLLOWING) +FROM null_cases +LIMIT 5 +---- +12 +12 +12 +12 +12 + +#fn window_frame_groups +query IIIIIIIIIIIIIIIIIIIIII +SELECT +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as a, +SUM(c1) OVER (ORDER BY c3 DESC GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as b, +SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as c, +SUM(c1) OVER (ORDER BY c3 DESC NULLS last GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as d, +SUM(c1) OVER (ORDER BY c3 DESC NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as e, +SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as f, +SUM(c1) OVER (ORDER BY c3 GROUPS current row) as a1, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a2, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a3, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a4, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a5, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as a6, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) as a7, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 3 FOLLOWING AND UNBOUNDED FOLLOWING) as a8, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN current row AND UNBOUNDED FOLLOWING) as a9, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN current row AND 3 FOLLOWING) as a10, +SUM(c1) OVER (ORDER BY c3 GROUPS BETWEEN 5 FOLLOWING AND 7 FOLLOWING) as a11, +SUM(c1) OVER (ORDER BY c3 DESC GROUPS current row) as a21, +SUM(c1) OVER (ORDER BY c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a22, +SUM(c1) OVER (ORDER BY c3 DESC NULLS last GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a23, +SUM(c1) OVER (ORDER BY c3 NULLS last GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a24, +SUM(c1) OVER (ORDER BY c3 DESC NULLS first GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a25 +FROM null_cases +ORDER BY c3 +LIMIT 10 +---- +412 307 412 307 307 412 NULL NULL NULL 412 NULL 4627 4627 4531 4627 115 85 NULL NULL 4487 412 4627 +488 339 488 339 339 488 72 NULL NULL 488 72 4627 4627 4512 4627 140 153 72 NULL 4473 488 4627 +543 412 543 412 412 543 24 NULL NULL 543 96 4627 4627 4487 4555 82 122 24 NULL 4442 543 4555 +553 488 553 488 488 553 19 NULL NULL 553 115 4627 4555 4473 4531 89 114 19 NULL 4402 553 4531 +553 543 553 543 543 553 25 
NULL NULL 553 140 4627 4531 4442 4512 110 105 25 NULL 4320 553 4512 +591 553 591 553 553 591 14 NULL NULL 591 154 4627 4512 4402 4487 167 181 14 NULL 4320 591 4487 +651 553 651 553 553 651 31 72 72 651 185 4627 4487 4320 4473 153 204 31 72 4288 651 4473 +662 591 662 591 591 662 40 96 96 662 225 4627 4473 4320 4442 154 141 40 96 4215 662 4442 +697 651 697 651 651 697 82 115 115 697 307 4627 4442 4288 4402 187 65 82 115 4139 697 4402 +758 662 758 662 662 758 NULL 140 140 758 307 4627 4402 4215 4320 181 48 NULL 140 4084 758 4320 + +#fn window_frame_groups_multiple_order_columns +query IIIIIIIIIIIIIIIII +SELECT +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as a, +SUM(c1) OVER (ORDER BY c2, c3 DESC GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as b, +SUM(c1) OVER (ORDER BY c2, c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as c, +SUM(c1) OVER (ORDER BY c2, c3 DESC NULLS last GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as d, +SUM(c1) OVER (ORDER BY c2, c3 DESC NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as e, +SUM(c1) OVER (ORDER BY c2, c3 NULLS first GROUPS BETWEEN 9 PRECEDING AND 11 FOLLOWING) as f, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS current row) as a1, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 9 PRECEDING AND 5 PRECEDING) as a2, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 5 PRECEDING) as a3, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING) as a4, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND current row) as a5, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as a6, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) as a7, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 3 FOLLOWING AND UNBOUNDED FOLLOWING) as a8, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN current row AND UNBOUNDED FOLLOWING) as a9, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN current row AND 3 FOLLOWING) as a10, +SUM(c1) OVER (ORDER BY c2, c3 GROUPS BETWEEN 5 FOLLOWING AND 7 FOLLOWING) as a11 +FROM null_cases +ORDER BY c3 +LIMIT 10 +---- +818 910 818 910 910 818 NULL 249 249 818 432 4627 4234 4157 4195 98 82 +537 979 537 979 979 537 72 NULL NULL 537 210 4627 4569 4378 4489 169 55 +811 838 811 838 838 811 24 221 3075 3665 3311 4627 1390 1276 1340 117 144 +763 464 763 464 464 763 19 168 3572 4167 3684 4627 962 829 962 194 80 +552 964 552 964 964 552 25 NULL NULL 552 235 4627 4489 4320 4417 167 39 +963 930 963 930 930 963 14 201 818 1580 1098 4627 3638 3455 3543 177 224 +1113 814 1113 814 814 1113 31 415 2653 3351 2885 4627 1798 1694 1773 165 162 +780 868 780 868 868 780 40 258 3143 3665 3351 4627 1340 1223 1316 117 102 +740 466 740 466 466 740 82 164 3592 4168 3766 4627 962 768 943 244 122 +772 832 772 832 832 772 NULL 277 3189 3684 3351 4627 1316 1199 1276 119 64 + +#fn window_frame_groups_without_order_by +# Try executing an erroneous query (the ORDER BY clause is missing in the +# window frame): +statement error Error during planning: GROUPS mode requires an ORDER BY clause +SELECT + SUM(c4) OVER(PARTITION BY c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) + FROM aggregate_test_100 + ORDER BY c9; + + +#fn window_frame_lag +statement error This feature is not implemented: There is only support Literal types for field at idx: 1 in Window Function +SELECT c2, + lag(c2, c2, c2) OVER () as lag1 + FROM aggregate_test_100; + + +#fn window_frame_creation +statement error DataFusion error: Error during planning: 
Invalid window frame: start bound \(1 PRECEDING\) cannot be larger than end bound \(2 PRECEDING\) +SELECT + COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN 1 PRECEDING AND 2 PRECEDING) + FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: Invalid window frame: start bound \(2 FOLLOWING\) cannot be larger than end bound \(1 FOLLOWING\) +SELECT + COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN 2 FOLLOWING AND 1 FOLLOWING) + FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: GROUPS mode requires an ORDER BY clause +SELECT + COUNT(c1) OVER(GROUPS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM aggregate_test_100 + + +#fn test_window_row_number_aggregate +query III +SELECT + c8, + ROW_NUMBER() OVER(ORDER BY c9) AS rn1, + ROW_NUMBER() OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rn2 + FROM aggregate_test_100 + ORDER BY c8 + LIMIT 5 +---- +102 73 73 +299 1 1 +363 41 41 +417 14 14 +794 95 95 + +#fn test_window_range_equivalent_frames +query IIIIIII +SELECT + c9, + COUNT(*) OVER(ORDER BY c9, c1 RANGE BETWEEN CURRENT ROW AND CURRENT ROW) AS cnt1, + COUNT(*) OVER(ORDER BY c9, c1 RANGE UNBOUNDED PRECEDING) AS cnt2, + COUNT(*) OVER(ORDER BY c9, c1 RANGE CURRENT ROW) AS cnt3, + COUNT(*) OVER(RANGE BETWEEN CURRENT ROW AND CURRENT ROW) AS cnt4, + COUNT(*) OVER(RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cnt5, + COUNT(*) OVER(RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS cnt6 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +28774375 1 1 1 100 100 100 +63044568 1 2 1 100 100 100 +141047417 1 3 1 100 100 100 +141680161 1 4 1 100 100 100 +145294611 1 5 1 100 100 100 + +#fn test_window_cume_dist +query IRR +SELECT + c8, + CUME_DIST() OVER(ORDER BY c9) as cd1, + CUME_DIST() OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as cd2 + FROM aggregate_test_100 + ORDER BY c8 + LIMIT 5 +---- +102 0.73 0.73 +299 0.01 0.01 +363 0.41 0.41 +417 0.14 0.14 +794 0.95 0.95 + +#fn test_window_rank +query IIIIIRR +SELECT + c9, + RANK() OVER(ORDER BY c1) AS rank1, + RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2, + DENSE_RANK() OVER(ORDER BY c1) as dense_rank1, + DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2, + PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1, + PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +28774375 80 80 5 5 0.79797979798 0.79797979798 +63044568 62 62 4 4 0.616161616162 0.616161616162 +141047417 1 1 1 1 0 0 +141680161 41 41 3 3 0.40404040404 0.40404040404 +145294611 1 1 1 1 0 0 + +#fn test_lag_lead +query IIIII +SELECT + c9, + LAG(c9, 2, 10101) OVER(ORDER BY c9) as lag1, + LAG(c9, 2, 10101) OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as lag2, + LEAD(c9, 2, 10101) OVER(ORDER BY c9) as lead1, + LEAD(c9, 2, 10101) OVER(ORDER BY c9 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as lead2 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +28774375 10101 10101 141047417 141047417 +63044568 10101 10101 141680161 141680161 +141047417 28774375 28774375 145294611 145294611 +141680161 63044568 63044568 225513085 225513085 +145294611 141047417 141047417 243203849 243203849 + +#fn test_window_frame_first_value_last_value_aggregate +query IIII +SELECT + FIRST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING) as first_value1, + FIRST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) 
as first_value2, + LAST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING) as last_value1, + LAST_VALUE(c4) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) as last_value2 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +-16110 -16110 3917 -1114 +-16110 -16110 -16974 15673 +-16110 -16110 -1114 13630 +-16110 3917 15673 -13217 +-16110 -16974 13630 20690 + +#fn test_window_frame_nth_value_aggregate +query II +SELECT + NTH_VALUE(c4, 3) OVER(ORDER BY c9 ASC ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING) as nth_value1, + NTH_VALUE(c4, 2) OVER(ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) as nth_value2 + FROM aggregate_test_100 + ORDER BY c9 + LIMIT 5 +---- +NULL 3917 +-16974 3917 +-16974 -16974 +-1114 -1114 +15673 15673 diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index 9642a2880e6d..a4875d5cbf33 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -73,6 +73,9 @@ async fn tpcds_logical_q9() -> Result<()> { create_logical_plan(9).await } +#[ignore] +// Schema error: No field named 'c'.'c_customer_sk'. +// issue: https://github.com/apache/arrow-datafusion/issues/4794 #[tokio::test] async fn tpcds_logical_q10() -> Result<()> { create_logical_plan(10).await @@ -198,6 +201,9 @@ async fn tpcds_logical_q34() -> Result<()> { create_logical_plan(34).await } +#[ignore] +// Schema error: No field named 'c'.'c_customer_sk'. +// issue: https://github.com/apache/arrow-datafusion/issues/4794 #[tokio::test] async fn tpcds_logical_q35() -> Result<()> { create_logical_plan(35).await @@ -229,6 +235,9 @@ async fn tpcds_logical_q40() -> Result<()> { } #[tokio::test] +#[ignore] +// Optimizer rule 'scalar_subquery_to_join' failed: Optimizing disjunctions not supported! 
+// issue: https://github.com/apache/arrow-datafusion/issues/5368 async fn tpcds_logical_q41() -> Result<()> { create_logical_plan(41).await } diff --git a/datafusion/core/tests/user_defined_aggregates.rs b/datafusion/core/tests/user_defined_aggregates.rs index b00ad12a520c..25183a1b21dd 100644 --- a/datafusion/core/tests/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined_aggregates.rs @@ -44,11 +44,11 @@ async fn test_udf_returning_struct() { let ctx = udaf_struct_context(); let sql = "SELECT first(value, time) from t"; let expected = vec![ - "+--------------------------------------------------+", - "| first(t.value,t.time) |", - "+--------------------------------------------------+", - "| {\"value\": 2, \"time\": 1970-01-01T00:00:00.000002} |", - "+--------------------------------------------------+", + "+------------------------------------------------+", + "| first(t.value,t.time) |", + "+------------------------------------------------+", + "| {value: 2.0, time: 1970-01-01T00:00:00.000002} |", + "+------------------------------------------------+", ]; assert_batches_eq!(expected, &execute(&ctx, sql).await); } @@ -62,7 +62,7 @@ async fn test_udf_returning_struct_sq() { "+-----------------+----------------------------+", "| sq.first[value] | sq.first[time] |", "+-----------------+----------------------------+", - "| 2 | 1970-01-01T00:00:00.000002 |", + "| 2.0 | 1970-01-01T00:00:00.000002 |", "+-----------------+----------------------------+", ]; assert_batches_eq!(expected, &execute(&ctx, sql).await); diff --git a/datafusion/core/tests/user_defined_plan.rs b/datafusion/core/tests/user_defined_plan.rs index af7eb61f4605..3b1ea76a84a2 100644 --- a/datafusion/core/tests/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined_plan.rs @@ -218,8 +218,7 @@ async fn topk_plan() -> Result<()> { let mut expected = vec![ "| logical_plan after topk | TopK: k=3 |", - "| | Projection: sales.customer_id, sales.revenue |", - "| | TableScan: sales projection=[customer_id,revenue] |", + "| | TableScan: sales projection=[customer_id,revenue] |", ].join("\n"); let explain_query = format!("EXPLAIN VERBOSE {QUERY}"); diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index dc72391463a0..8c67d68b10ef 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-expr" description = "Logical plan and expression representation for DataFusion query engine" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -36,8 +36,8 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "32.0.0", default-features = false } -datafusion-common = { path = "../common", version = "18.0.0" } +arrow = { version = "34.0.0", default-features = false } +datafusion-common = { path = "../common", version = "19.0.0" } log = "^0.4" sqlparser = "0.30" diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index a78808b26c29..8b6c39043b6c 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -238,35 +238,6 @@ impl BinaryExpr { pub fn new(left: Box, op: Operator, right: Box) -> Self { Self { left, op, right } } - - /// Get the operator precedence - /// use as a reference - pub fn precedence(&self) -> u8 { - match self.op { - Operator::Or => 5, - Operator::And => 10, - 
Operator::NotEq
-            | Operator::Eq
-            | Operator::Lt
-            | Operator::LtEq
-            | Operator::Gt
-            | Operator::GtEq => 20,
-            Operator::Plus | Operator::Minus => 30,
-            Operator::Multiply | Operator::Divide | Operator::Modulo => 40,
-            Operator::IsDistinctFrom
-            | Operator::IsNotDistinctFrom
-            | Operator::RegexMatch
-            | Operator::RegexNotMatch
-            | Operator::RegexIMatch
-            | Operator::RegexNotIMatch
-            | Operator::BitwiseAnd
-            | Operator::BitwiseOr
-            | Operator::BitwiseShiftLeft
-            | Operator::BitwiseShiftRight
-            | Operator::BitwiseXor
-            | Operator::StringConcat => 0,
-        }
-    }
 }
 
 impl Display for BinaryExpr {
@@ -283,7 +254,7 @@ impl Display for BinaryExpr {
         ) -> fmt::Result {
             match expr {
                 Expr::BinaryExpr(child) => {
-                    let p = child.precedence();
+                    let p = child.op.precedence();
                     if p == 0 || p < precedence {
                         write!(f, "({child})")?;
                     } else {
@@ -295,7 +266,7 @@ impl Display for BinaryExpr {
             Ok(())
         }
 
-        let precedence = self.precedence();
+        let precedence = self.op.precedence();
         write_child(f, self.left.as_ref(), precedence)?;
         write!(f, " {} ", self.op)?;
         write_child(f, self.right.as_ref(), precedence)
diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index d312a4c8e7a7..e802d4a8519a 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -340,9 +340,37 @@ impl LogicalPlanBuilder {
     }
 
     /// Add missing sort columns to all downstream projection
+    ///
+    /// Thus, if you have a LogicalPlan that selects A and B and have
+    /// not requested a sort by C, this code will add C recursively to
+    /// all input projections.
+    ///
+    /// Adding a new column is not correct if there is a `Distinct`
+    /// node, which produces only distinct values of its
+    /// inputs. Adding a new column to its input will result in
+    /// potentially different results than with the original column.
+    ///
+    /// For example, if the input is like:
+    ///
+    /// Distinct(A, B)
+    ///
+    /// If the input looks like
+    ///
+    /// a | b | c
+    /// --+---+---
+    /// 1 | 2 | 3
+    /// 1 | 2 | 4
+    ///
+    /// Distinct (A, B) --> (1,2)
+    ///
+    /// But Distinct (A, B, C) --> (1, 2, 3), (1, 2, 4)
+    /// (which will appear as (1, 2), (1, 2) if a and b are projected)
+    ///
+    /// See for more details
     fn add_missing_columns(
         curr_plan: LogicalPlan,
         missing_cols: &[Column],
+        is_distinct: bool,
     ) -> Result {
         match curr_plan {
             LogicalPlan::Projection(Projection {
@@ -362,15 +390,24 @@ impl LogicalPlanBuilder {
                 // missing_cols may be already present but without the new
                 // projected alias.
                 missing_exprs.retain(|e| !expr.contains(e));
+                if is_distinct {
+                    Self::ambiguous_distinct_check(&missing_exprs, missing_cols, &expr)?;
+                }
                 expr.extend(missing_exprs);
 
                 Ok(project((*input).clone(), expr)?)
} _ => { + let is_distinct = + is_distinct || matches!(curr_plan, LogicalPlan::Distinct(_)); let new_inputs = curr_plan .inputs() .into_iter() .map(|input_plan| { - Self::add_missing_columns((*input_plan).clone(), missing_cols) + Self::add_missing_columns( + (*input_plan).clone(), + missing_cols, + is_distinct, + ) }) .collect::>>()?; @@ -380,6 +417,45 @@ impl LogicalPlanBuilder { } } + fn ambiguous_distinct_check( + missing_exprs: &[Expr], + missing_cols: &[Column], + projection_exprs: &[Expr], + ) -> Result<()> { + if missing_exprs.is_empty() { + return Ok(()); + } + + // if the missing columns are all only aliases for things in + // the existing select list, it is ok + // + // This handles the special case for + // SELECT col as ORDER BY + // + // As described in https://github.com/apache/arrow-datafusion/issues/5293 + let all_aliases = missing_exprs.iter().all(|e| { + projection_exprs.iter().any(|proj_expr| { + if let Expr::Alias(expr, _) = proj_expr { + e == expr.as_ref() + } else { + false + } + }) + }); + if all_aliases { + return Ok(()); + } + + let missing_col_names = missing_cols + .iter() + .map(|col| col.flat_name()) + .collect::(); + + Err(DataFusionError::Plan(format!( + "For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list", + ))) + } + /// Apply a sort pub fn sort( self, @@ -406,16 +482,6 @@ impl LogicalPlanBuilder { Ok(()) })?; - self.create_sort_plan(exprs, missing_cols) - } - - pub fn create_sort_plan( - self, - exprs: impl IntoIterator> + Clone, - missing_cols: Vec, - ) -> Result { - let schema = self.plan.schema(); - if missing_cols.is_empty() { return Ok(Self::from(LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &self.plan)?, @@ -431,7 +497,8 @@ impl LogicalPlanBuilder { .map(|f| Expr::Column(f.qualified_column())) .collect(); - let plan = Self::add_missing_columns(self.plan, &missing_cols)?; + let is_distinct = false; + let plan = Self::add_missing_columns(self.plan, &missing_cols, is_distinct)?; let sort_plan = LogicalPlan::Sort(Sort { expr: normalize_cols(exprs, &plan)?, input: Arc::new(plan), diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d86a44e5dccd..c3ef861eb3b4 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -962,10 +962,13 @@ impl LogicalPlan { let mut full_filter = vec![]; let mut partial_filter = vec![]; let mut unsupported_filters = vec![]; + let filters: Vec<&Expr> = filters.iter().collect(); - filters.iter().for_each(|x| { - if let Ok(t) = source.supports_filter_pushdown(x) { - match t { + if let Ok(results) = + source.supports_filters_pushdown(&filters) + { + filters.iter().zip(results.iter()).for_each( + |(x, res)| match res { TableProviderFilterPushDown::Exact => { full_filter.push(x) } @@ -975,9 +978,9 @@ impl LogicalPlan { TableProviderFilterPushDown::Unsupported => { unsupported_filters.push(x) } - } - } - }); + }, + ); + } if !full_filter.is_empty() { write!(f, ", full_filters={full_filter:?}")?; @@ -1588,7 +1591,7 @@ pub struct CreateExternalTable { pub if_not_exists: bool, /// SQL used to create the table, if available pub definition: Option, - /// File compression type (GZIP, BZIP2, XZ) + /// File compression type (GZIP, BZIP2, XZ, ZSTD) pub file_compression_type: CompressionTypeVariant, /// Table(provider) specific options pub options: HashMap, diff --git a/datafusion/expr/src/operator.rs b/datafusion/expr/src/operator.rs index fac81654c4a8..ca6fb75276ff 100644 --- 
a/datafusion/expr/src/operator.rs
+++ b/datafusion/expr/src/operator.rs
@@ -142,6 +142,35 @@ impl Operator {
             | Operator::StringConcat => None,
         }
     }
+
+    /// Get the operator precedence
+    /// use as a reference
+    pub fn precedence(&self) -> u8 {
+        match self {
+            Operator::Or => 5,
+            Operator::And => 10,
+            Operator::NotEq
+            | Operator::Eq
+            | Operator::Lt
+            | Operator::LtEq
+            | Operator::Gt
+            | Operator::GtEq => 20,
+            Operator::Plus | Operator::Minus => 30,
+            Operator::Multiply | Operator::Divide | Operator::Modulo => 40,
+            Operator::IsDistinctFrom
+            | Operator::IsNotDistinctFrom
+            | Operator::RegexMatch
+            | Operator::RegexNotMatch
+            | Operator::RegexIMatch
+            | Operator::RegexNotIMatch
+            | Operator::BitwiseAnd
+            | Operator::BitwiseOr
+            | Operator::BitwiseShiftLeft
+            | Operator::BitwiseShiftRight
+            | Operator::BitwiseXor
+            | Operator::StringConcat => 0,
+        }
+    }
 }
 
 impl fmt::Display for Operator {
diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs
index 10984f779936..5954335c676d 100644
--- a/datafusion/expr/src/table_source.rs
+++ b/datafusion/expr/src/table_source.rs
@@ -17,6 +17,7 @@
 use crate::{Expr, LogicalPlan};
 use arrow::datatypes::SchemaRef;
+use datafusion_common::Result;
 use std::any::Any;
 
 ///! Table source
@@ -70,13 +71,27 @@ pub trait TableSource: Sync + Send {
 
     /// Tests whether the table provider can make use of a filter expression
     /// to optimise data retrieval.
+    #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown instead")]
     fn supports_filter_pushdown(
         &self,
         _filter: &Expr,
-    ) -> datafusion_common::Result {
+    ) -> Result {
         Ok(TableProviderFilterPushDown::Unsupported)
     }
 
+    /// Tests whether the table provider can make use of any or all filter expressions
+    /// to optimise data retrieval.
+    #[allow(deprecated)]
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result> {
+        filters
+            .iter()
+            .map(|f| self.supports_filter_pushdown(f))
+            .collect()
+    }
+
     /// Get the Logical plan of this table provider, if available.
     fn get_logical_plan(&self) -> Option<&LogicalPlan> {
         None
diff --git a/datafusion/expr/src/type_coercion.rs b/datafusion/expr/src/type_coercion.rs
index 502925e9b8a0..373f4a7a2d4a 100644
--- a/datafusion/expr/src/type_coercion.rs
+++ b/datafusion/expr/src/type_coercion.rs
@@ -33,7 +33,7 @@
 use arrow::datatypes::DataType;
 
-/// Determine if a DataType is signed numeric or not
+/// Determine whether the given data type `dt` represents signed numeric values.
 pub fn is_signed_numeric(dt: &DataType) -> bool {
     matches!(
         dt,
@@ -48,12 +48,12 @@ pub fn is_signed_numeric(dt: &DataType) -> bool {
     )
 }
 
-// Determine if a DataType is Null or not
+/// Determine whether the given data type `dt` is `Null`.
 pub fn is_null(dt: &DataType) -> bool {
     *dt == DataType::Null
 }
 
-/// Determine if a DataType is numeric or not
+/// Determine whether the given data type `dt` represents numeric values.
 pub fn is_numeric(dt: &DataType) -> bool {
     is_signed_numeric(dt)
         || matches!(
@@ -62,18 +62,19 @@ pub fn is_numeric(dt: &DataType) -> bool {
         )
 }
 
-/// Determine if a DataType is Timestamp or not
+/// Determine whether the given data type `dt` is a `Timestamp`.
 pub fn is_timestamp(dt: &DataType) -> bool {
     matches!(dt, DataType::Timestamp(_, _))
 }
 
-/// Determine if a DataType is Date or not
+/// Determine whether the given data type `dt` is a `Date`.
pub fn is_date(dt: &DataType) -> bool { matches!(dt, DataType::Date32 | DataType::Date64) } -pub fn is_uft8(dt: &DataType) -> bool { - matches!(dt, DataType::Utf8) +/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`. +pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool { + matches!(dt, DataType::Utf8 | DataType::LargeUtf8) } pub mod aggregates; diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index c25d2491e45a..7794125d0eec 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -164,7 +164,7 @@ pub fn regularize(mut frame: WindowFrame, order_bys: usize) -> Result Result<()> { + let outer_scan = test_table_scan()?; + let subquery_scan = test_table_scan()?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter(col("test.a").gt(col("test.b")))? + .project(vec![col("c")])? + .build()?; + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + // Subquery and outer query refer to the same table. + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: EXISTS () [a:UInt32, b:UInt32, c:UInt32]\ + \n Subquery: [c:UInt32]\ + \n Projection: test.c [c:UInt32]\ + \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } } diff --git a/datafusion/optimizer/src/decorrelate_where_in.rs b/datafusion/optimizer/src/decorrelate_where_in.rs index c8ff65f12523..bc70098610f9 100644 --- a/datafusion/optimizer/src/decorrelate_where_in.rs +++ b/datafusion/optimizer/src/decorrelate_where_in.rs @@ -1149,4 +1149,35 @@ mod tests { ); Ok(()) } + + #[test] + fn in_subquery_with_same_table() -> Result<()> { + let outer_scan = test_table_scan()?; + let subquery_scan = test_table_scan()?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter(col("test.a").gt(col("test.b")))? + .project(vec![col("c")])? + .build()?; + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(in_subquery(col("test.a"), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + // Subquery and outer query refer to the same table. + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.a = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: test.c AS c [c:UInt32]\ + \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } } diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index c19b43d297e3..bcf4b4d5a86e 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -38,7 +38,7 @@ impl EliminateCrossJoin { } } -/// Attempt to reorder join tp eliminate cross joins to inner joins. +/// Attempt to reorder join to eliminate cross joins to inner joins. /// for queries: /// 'select ... from a, b where a.x = b.y and b.xx = 100;' /// 'select ... 
from a, b where (a.x = b.y and b.xx = 100) or (a.x = b.y and b.xx = 200);' @@ -247,10 +247,12 @@ fn intersect( vec1: &[(Expr, Expr)], vec2: &[(Expr, Expr)], ) { - for x1 in vec1.iter() { - for x2 in vec2.iter() { - if x1.0 == x2.0 && x1.1 == x2.1 || x1.1 == x2.0 && x1.0 == x2.1 { - accum.push((x1.0.clone(), x1.1.clone())); + if !(vec1.is_empty() || vec2.is_empty()) { + for x1 in vec1.iter() { + for x2 in vec2.iter() { + if x1.0 == x2.0 && x1.1 == x2.1 || x1.1 == x2.0 && x1.0 == x2.1 { + accum.push((x1.0.clone(), x1.1.clone())); + } } } } diff --git a/datafusion/optimizer/src/eliminate_outer_join.rs b/datafusion/optimizer/src/eliminate_outer_join.rs index 6a7914034bd1..66f22da219d4 100644 --- a/datafusion/optimizer/src/eliminate_outer_join.rs +++ b/datafusion/optimizer/src/eliminate_outer_join.rs @@ -33,7 +33,7 @@ use std::sync::Arc; /// Attempt to replace outer joins with inner joins. /// /// Outer joins are typically more expensive to compute at runtime -/// than inner joins and prevent various forms fo predicate pushdown +/// than inner joins and prevent various forms of predicate pushdown /// and other optimizations, so removing them if possible is beneficial. /// /// Inner joins filter out rows that do match. Outer joins pass rows @@ -44,7 +44,7 @@ use std::sync::Arc; /// For example, in the `select ... from a left join b on ... where b.xx = 100;` /// /// For rows when `b.xx` is null (as it would be after an outer join), -/// the `b.xx = 100` predicate filters them out and there there is no +/// the `b.xx = 100` predicate filters them out and there is no /// need to produce null rows for output. /// /// Generally, an outer join can be rewritten to inner join if the diff --git a/datafusion/optimizer/src/eliminate_project.rs b/datafusion/optimizer/src/eliminate_project.rs new file mode 100644 index 000000000000..143004aaabeb --- /dev/null +++ b/datafusion/optimizer/src/eliminate_project.rs @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::optimizer::ApplyOrder; +use crate::{OptimizerConfig, OptimizerRule}; +use datafusion_common::{DFSchemaRef, Result}; +use datafusion_expr::logical_plan::LogicalPlan; +use datafusion_expr::{Expr, Projection}; + +/// Optimization rule that eliminate unnecessary [LogicalPlan::Projection]. 
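+///
+/// For example (a rough illustration; the rule only fires when the projection
+/// simply passes through the columns of its input in the same order), a plan like
+///
+/// ```text
+/// Projection: test.a, test.b
+///   TableScan: test projection=[a, b]
+/// ```
+///
+/// can be collapsed to just
+///
+/// ```text
+/// TableScan: test projection=[a, b]
+/// ```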
+#[derive(Default)] +pub struct EliminateProjection; + +impl EliminateProjection { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for EliminateProjection { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Projection(projection) => { + let child_plan = projection.input.as_ref(); + match child_plan { + LogicalPlan::Join(_) + | LogicalPlan::CrossJoin(_) + | LogicalPlan::Union(_) + | LogicalPlan::Filter(_) + | LogicalPlan::TableScan(_) + | LogicalPlan::SubqueryAlias(_) + | LogicalPlan::Sort(_) => { + if can_eliminate(projection, child_plan.schema()) { + Ok(Some(child_plan.clone())) + } else { + Ok(None) + } + } + _ => { + if plan.schema() == child_plan.schema() { + Ok(Some(child_plan.clone())) + } else { + Ok(None) + } + } + } + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "eliminate_projection" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +fn can_eliminate(projection: &Projection, schema: &DFSchemaRef) -> bool { + if projection.expr.len() != schema.fields().len() { + return false; + } + for (i, e) in projection.expr.iter().enumerate() { + match e { + Expr::Column(c) => { + let d = schema.fields().get(i).unwrap(); + if c != &d.qualified_column() && c != &d.unqualified_column() { + return false; + } + } + _ => return false, + } + } + true +} diff --git a/datafusion/optimizer/src/inline_table_scan.rs b/datafusion/optimizer/src/inline_table_scan.rs index 39e7d43845f6..6b58399192ee 100644 --- a/datafusion/optimizer/src/inline_table_scan.rs +++ b/datafusion/optimizer/src/inline_table_scan.rs @@ -49,11 +49,14 @@ impl OptimizerRule for InlineTableScan { source, table_name, filters, + projection, .. }) if filters.is_empty() => { if let Some(sub_plan) = source.get_logical_plan() { + let projection_exprs = + generate_projection_expr(projection, sub_plan)?; let plan = LogicalPlanBuilder::from(sub_plan.clone()) - .project(vec![Expr::Wildcard])? + .project(projection_exprs)? 
.alias(table_name)?; Ok(Some(plan.build()?)) } else { @@ -73,6 +76,23 @@ impl OptimizerRule for InlineTableScan { } } +fn generate_projection_expr( + projection: &Option>, + sub_plan: &LogicalPlan, +) -> Result> { + let mut exprs = vec![]; + if let Some(projection) = projection { + for i in projection { + exprs.push(Expr::Column( + sub_plan.schema().fields()[*i].qualified_column(), + )); + } + } else { + exprs.push(Expr::Wildcard); + } + Ok(exprs) +} + #[cfg(test)] mod tests { use std::{sync::Arc, vec}; @@ -91,7 +111,10 @@ mod tests { } fn schema(&self) -> arrow::datatypes::SchemaRef { - Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)])) + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + ])) } fn supports_filter_pushdown( @@ -150,9 +173,25 @@ mod tests { let plan = scan.filter(col("x.a").eq(lit(1)))?.build()?; let expected = "Filter: x.a = Int32(1)\ \n SubqueryAlias: x\ - \n Projection: y.a\ + \n Projection: y.a, y.b\ \n TableScan: y"; assert_optimized_plan_eq(Arc::new(InlineTableScan::new()), &plan, expected) } + + #[test] + fn inline_table_scan_with_projection() -> datafusion_common::Result<()> { + let scan = LogicalPlanBuilder::scan( + "x".to_string(), + Arc::new(CustomSource::new()), + Some(vec![0]), + )?; + + let plan = scan.build()?; + let expected = "SubqueryAlias: x\ + \n Projection: y.a\ + \n TableScan: y"; + + assert_optimized_plan_eq(Arc::new(InlineTableScan::new()), &plan, expected) + } } diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index cd743fcda73b..ca0611d4e17e 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -23,9 +23,11 @@ pub mod eliminate_cross_join; pub mod eliminate_filter; pub mod eliminate_limit; pub mod eliminate_outer_join; +pub mod eliminate_project; pub mod extract_equijoin_predicate; pub mod filter_null_join_keys; pub mod inline_table_scan; +pub mod merge_projection; pub mod optimizer; pub mod propagate_empty_relation; pub mod push_down_filter; @@ -37,6 +39,7 @@ pub mod single_distinct_to_groupby; pub mod type_coercion; pub mod utils; +pub mod replace_distinct_aggregate; pub mod rewrite_disjunctive_predicate; #[cfg(test)] pub mod test; diff --git a/datafusion/optimizer/src/merge_projection.rs b/datafusion/optimizer/src/merge_projection.rs new file mode 100644 index 000000000000..d551283015a9 --- /dev/null +++ b/datafusion/optimizer/src/merge_projection.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
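+
+//! [`MergeProjection`] merges two adjacent `Projection` nodes into one by
+//! rewriting the parent's expressions in terms of the child's input. Roughly,
+//! a plan such as
+//!
+//! ```text
+//! Projection: Int32(1) + test.a
+//!   Projection: test.a
+//!     TableScan: test
+//! ```
+//!
+//! becomes
+//!
+//! ```text
+//! Projection: Int32(1) + test.a
+//!   TableScan: test
+//! ```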
+ +use crate::optimizer::ApplyOrder; +use datafusion_common::Result; +use datafusion_expr::{Expr, LogicalPlan, Projection}; +use std::collections::HashMap; + +use crate::push_down_filter::replace_cols_by_name; +use crate::{OptimizerConfig, OptimizerRule}; + +/// Optimization rule that merge [LogicalPlan::Projection]. +#[derive(Default)] +pub struct MergeProjection; + +impl MergeProjection { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for MergeProjection { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Projection(parent_projection) => { + match parent_projection.input.as_ref() { + LogicalPlan::Projection(child_projection) => { + let replace_map = collect_projection_expr(child_projection); + let new_exprs = parent_projection + .expr + .iter() + .map(|expr| replace_cols_by_name(expr.clone(), &replace_map)) + .enumerate() + .map(|(i, e)| match e { + Ok(e) => { + let parent_expr = parent_projection.schema.fields() + [i] + .qualified_name(); + if e.display_name()? == parent_expr { + Ok(e) + } else { + Ok(e.alias(parent_expr)) + } + } + Err(e) => Err(e), + }) + .collect::>>()?; + let new_plan = + LogicalPlan::Projection(Projection::try_new_with_schema( + new_exprs, + child_projection.input.clone(), + parent_projection.schema.clone(), + )?); + Ok(Some( + self.try_optimize(&new_plan, _config)?.unwrap_or(new_plan), + )) + } + _ => Ok(None), + } + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "merge_projection" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +pub fn collect_projection_expr(projection: &Projection) -> HashMap { + projection + .schema + .fields() + .iter() + .enumerate() + .flat_map(|(i, field)| { + // strip alias + let expr = projection.expr[i].clone().unalias(); + // Convert both qualified and unqualified fields + [ + (field.name().clone(), expr.clone()), + (field.qualified_name(), expr), + ] + }) + .collect::>() +} + +#[cfg(test)] +mod tests { + use crate::merge_projection::MergeProjection; + use datafusion_common::Result; + use datafusion_expr::{ + binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, LogicalPlan, + Operator, + }; + use std::sync::Arc; + + use crate::test::*; + + fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> { + assert_optimized_plan_eq(Arc::new(MergeProjection::new()), plan, expected) + } + + #[test] + fn merge_two_projection() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a")])? + .project(vec![binary_expr(lit(1), Operator::Plus, col("a"))])? + .build()?; + + let expected = "Projection: Int32(1) + test.a\ + \n TableScan: test"; + assert_optimized_plan_equal(&plan, expected) + } + + #[test] + fn merge_three_projection() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .project(vec![col("a")])? + .project(vec![binary_expr(lit(1), Operator::Plus, col("a"))])? + .build()?; + + let expected = "Projection: Int32(1) + test.a\ + \n TableScan: test"; + assert_optimized_plan_equal(&plan, expected) + } + + #[test] + fn merge_alias() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a")])? + .project(vec![col("a").alias("alias")])? 
+ .build()?; + + let expected = "Projection: test.a AS alias\ + \n TableScan: test"; + assert_optimized_plan_equal(&plan, expected) + } +} diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index ffd303ea4c64..2738c9a15913 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -24,13 +24,16 @@ use crate::eliminate_cross_join::EliminateCrossJoin; use crate::eliminate_filter::EliminateFilter; use crate::eliminate_limit::EliminateLimit; use crate::eliminate_outer_join::EliminateOuterJoin; +use crate::eliminate_project::EliminateProjection; use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; use crate::filter_null_join_keys::FilterNullJoinKeys; use crate::inline_table_scan::InlineTableScan; +use crate::merge_projection::MergeProjection; use crate::propagate_empty_relation::PropagateEmptyRelation; use crate::push_down_filter::PushDownFilter; use crate::push_down_limit::PushDownLimit; use crate::push_down_projection::PushDownProjection; +use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate; use crate::rewrite_disjunctive_predicate::RewriteDisjunctivePredicate; use crate::scalar_subquery_to_join::ScalarSubqueryToJoin; use crate::simplify_expressions::SimplifyExpressions; @@ -63,7 +66,7 @@ pub trait OptimizerRule { /// How should the rule be applied by the optimizer? See comments on [`ApplyOrder`] for details. /// - /// If a rule use default None, its should traverse recursively plan inside itself + /// If a rule use default None, it should traverse recursively plan inside itself fn apply_order(&self) -> Option { None } @@ -207,6 +210,7 @@ impl Optimizer { Arc::new(TypeCoercion::new()), Arc::new(SimplifyExpressions::new()), Arc::new(UnwrapCastInComparison::new()), + Arc::new(ReplaceDistinctWithAggregate::new()), Arc::new(DecorrelateWhereExists::new()), Arc::new(DecorrelateWhereIn::new()), Arc::new(ScalarSubqueryToJoin::new()), @@ -215,6 +219,7 @@ impl Optimizer { // run it again after running the optimizations that potentially converted // subqueries to joins Arc::new(SimplifyExpressions::new()), + Arc::new(MergeProjection::new()), Arc::new(RewriteDisjunctivePredicate::new()), Arc::new(EliminateFilter::new()), Arc::new(EliminateCrossJoin::new()), @@ -233,6 +238,7 @@ impl Optimizer { Arc::new(UnwrapCastInComparison::new()), Arc::new(CommonSubexprEliminate::new()), Arc::new(PushDownProjection::new()), + Arc::new(EliminateProjection::new()), ]; Self::with_rules(rules) diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 8e90a683f730..6d8db2043c07 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -14,6 +14,7 @@ //! 
Push Down Filter optimizer rule ensures that filters are applied as early as possible in the plan +use crate::optimizer::ApplyOrder; use crate::utils::{conjunction, split_conjunction}; use crate::{utils, OptimizerConfig, OptimizerRule}; use datafusion_common::{Column, DFSchema, DataFusionError, Result}; @@ -26,10 +27,10 @@ use datafusion_expr::{ utils::from_plan, BinaryExpr, Expr, Filter, Operator, TableProviderFilterPushDown, }; +use itertools::Itertools; use std::collections::{HashMap, HashSet}; use std::iter::once; use std::sync::Arc; -use utils::optimize_children; /// Push Down Filter optimizer rule pushes filter clauses down the plan /// # Introduction @@ -511,25 +512,20 @@ impl OptimizerRule for PushDownFilter { "push_down_filter" } + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } + fn try_optimize( &self, plan: &LogicalPlan, - config: &dyn OptimizerConfig, + _config: &dyn OptimizerConfig, ) -> Result> { let filter = match plan { LogicalPlan::Filter(filter) => filter, // we also need to pushdown filter in Join. - LogicalPlan::Join(join) => { - let optimized_plan = push_down_join(plan, join, None)?; - return match optimized_plan { - Some(optimized_plan) => Ok(Some( - optimize_children(self, &optimized_plan, config)? - .unwrap_or(optimized_plan), - )), - None => optimize_children(self, plan, config), - }; - } - _ => return optimize_children(self, plan, config), + LogicalPlan::Join(join) => return push_down_join(plan, join, None), + _ => return Ok(None), }; let child_plan = filter.input.as_ref(); @@ -550,11 +546,12 @@ impl OptimizerRule for PushDownFilter { let new_predicate = conjunction(new_predicates).ok_or_else(|| { DataFusionError::Plan("at least one expression exists".to_string()) })?; - let new_plan = LogicalPlan::Filter(Filter::try_new( + let new_filter = LogicalPlan::Filter(Filter::try_new( new_predicate, child_filter.input.clone(), )?); - return self.try_optimize(&new_plan, config); + self.try_optimize(&new_filter, _config)? + .unwrap_or(new_filter) } LogicalPlan::Repartition(_) | LogicalPlan::Distinct(_) @@ -691,7 +688,7 @@ impl OptimizerRule for PushDownFilter { LogicalPlan::Join(join) => { match push_down_join(&filter.input, join, Some(&filter.predicate))? { Some(optimized_plan) => optimized_plan, - None => plan.clone(), + None => return Ok(None), } } LogicalPlan::CrossJoin(CrossJoin { left, right, .. }) => { @@ -699,30 +696,27 @@ impl OptimizerRule for PushDownFilter { push_down_all_join(predicates, &filter.input, left, right, vec![])? } LogicalPlan::TableScan(scan) => { - let mut new_scan_filters = scan.filters.clone(); - let mut new_predicate = vec![]; - - let filter_predicates = - utils::split_conjunction_owned(filter.predicate.clone()); - - for filter_expr in &filter_predicates { - let (preserve_filter_node, add_to_provider) = - match scan.source.supports_filter_pushdown(filter_expr)? { - TableProviderFilterPushDown::Unsupported => (true, false), - TableProviderFilterPushDown::Inexact => (true, true), - TableProviderFilterPushDown::Exact => (false, true), - }; - if preserve_filter_node { - new_predicate.push(filter_expr.clone()); - } - if add_to_provider { - // avoid reduplicated filter expr. 
- if new_scan_filters.contains(filter_expr) { - continue; - } - new_scan_filters.push(filter_expr.clone()); - } - } + let filter_predicates = split_conjunction(&filter.predicate); + let results = scan + .source + .supports_filters_pushdown(filter_predicates.as_slice())?; + let zip = filter_predicates.iter().zip(results.into_iter()); + + let new_scan_filters = zip + .clone() + .filter(|(_, res)| res != &TableProviderFilterPushDown::Unsupported) + .map(|(pred, _)| *pred); + let new_scan_filters: Vec = scan + .filters + .iter() + .chain(new_scan_filters) + .unique() + .cloned() + .collect(); + let new_predicate: Vec = zip + .filter(|(_, res)| res != &TableProviderFilterPushDown::Exact) + .map(|(pred, _)| (*pred).clone()) + .collect(); let new_scan = LogicalPlan::TableScan(TableScan { source: scan.source.clone(), @@ -741,12 +735,9 @@ impl OptimizerRule for PushDownFilter { None => new_scan, } } - _ => plan.clone(), + _ => return Ok(None), }; - - Ok(Some( - optimize_children(self, &new_plan, config)?.unwrap_or(new_plan), - )) + Ok(Some(new_plan)) } } @@ -777,6 +768,7 @@ pub fn replace_cols_by_name( #[cfg(test)] mod tests { use super::*; + use crate::optimizer::Optimizer; use crate::rewrite_disjunctive_predicate::RewriteDisjunctivePredicate; use crate::test::*; use crate::OptimizerContext; @@ -791,28 +783,35 @@ mod tests { use std::sync::Arc; fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) -> Result<()> { - let optimized_plan = PushDownFilter::new() - .try_optimize(plan, &OptimizerContext::new()) - .unwrap() - .expect("failed to optimize plan"); - let formatted_plan = format!("{optimized_plan:?}"); - assert_eq!(plan.schema(), optimized_plan.schema()); - assert_eq!(expected, formatted_plan); - Ok(()) + crate::test::assert_optimized_plan_eq( + Arc::new(PushDownFilter::new()), + plan, + expected, + ) } fn assert_optimized_plan_eq_with_rewrite_predicate( plan: &LogicalPlan, expected: &str, ) -> Result<()> { - let mut optimized_plan = RewriteDisjunctivePredicate::new() - .try_optimize(plan, &OptimizerContext::new()) - .unwrap() - .expect("failed to optimize plan"); - optimized_plan = PushDownFilter::new() - .try_optimize(&optimized_plan, &OptimizerContext::new()) - .unwrap() - .expect("failed to optimize plan"); + let optimizer = Optimizer::with_rules(vec![ + Arc::new(RewriteDisjunctivePredicate::new()), + Arc::new(PushDownFilter::new()), + ]); + let mut optimized_plan = optimizer + .optimize_recursively( + optimizer.rules.get(0).unwrap(), + plan, + &OptimizerContext::new(), + )? + .unwrap_or_else(|| plan.clone()); + optimized_plan = optimizer + .optimize_recursively( + optimizer.rules.get(1).unwrap(), + &optimized_plan, + &OptimizerContext::new(), + )? 
+ .unwrap_or_else(|| plan.clone()); let formatted_plan = format!("{optimized_plan:?}"); assert_eq!(plan.schema(), optimized_plan.schema()); assert_eq!(expected, formatted_plan); @@ -1902,7 +1901,7 @@ mod tests { fn supports_filter_pushdown( &self, - _: &Expr, + _e: &Expr, ) -> Result { Ok(self.filter_support.clone()) } @@ -2012,6 +2011,37 @@ mod tests { assert_optimized_plan_eq(&plan, expected) } + #[test] + fn multi_combined_filter_exact() -> Result<()> { + let test_provider = PushDownProvider { + filter_support: TableProviderFilterPushDown::Exact, + }; + + let table_scan = LogicalPlan::TableScan(TableScan { + table_name: "test".to_string(), + filters: vec![], + projected_schema: Arc::new(DFSchema::try_from( + (*test_provider.schema()).clone(), + )?), + projection: Some(vec![0]), + source: Arc::new(test_provider), + fetch: None, + }); + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(and(col("a").eq(lit(10i64)), col("b").gt(lit(11i64))))? + .project(vec![col("a"), col("b")])? + .build()?; + + let expected = r#" +Projection: a, b + TableScan: test projection=[a], full_filters=[a = Int64(10), b > Int64(11)] + "# + .trim(); + + assert_optimized_plan_eq(&plan, expected) + } + #[test] fn test_filter_with_alias() -> Result<()> { // in table scan the true col name is 'test.a', diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs new file mode 100644 index 000000000000..2a604bc3ffea --- /dev/null +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::optimizer::ApplyOrder; +use crate::{OptimizerConfig, OptimizerRule}; +use datafusion_common::Result; +use datafusion_expr::utils::expand_wildcard; +use datafusion_expr::Distinct; +use datafusion_expr::{Aggregate, LogicalPlan}; +use ApplyOrder::BottomUp; + +/// Optimizer that replaces logical [[Distinct]] with a logical [[Aggregate]] +/// +/// ```text +/// SELECT DISTINCT a, b FROM tab +/// ``` +/// +/// Into +/// ```text +/// SELECT a, b FROM tab GROUP BY a, b +/// ``` + +/// Optimizer that replaces logical [[Distinct]] with a logical [[Aggregate]] +#[derive(Default)] +pub struct ReplaceDistinctWithAggregate {} + +impl ReplaceDistinctWithAggregate { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for ReplaceDistinctWithAggregate { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Distinct(Distinct { input }) => { + let group_expr = expand_wildcard(input.schema(), input)?; + let aggregate = LogicalPlan::Aggregate(Aggregate::try_new_with_schema( + input.clone(), + group_expr, + vec![], + input.schema().clone(), // input schema and aggregate schema are the same in this case + )?); + Ok(Some(aggregate)) + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "replace_distinct_aggregate" + } + + fn apply_order(&self) -> Option { + Some(BottomUp) + } +} + +#[cfg(test)] +mod tests { + use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate; + use crate::test::{assert_optimized_plan_eq, test_table_scan}; + use datafusion_expr::{col, LogicalPlanBuilder}; + use std::sync::Arc; + + #[test] + fn replace_distinct() -> datafusion_common::Result<()> { + let table_scan = test_table_scan().unwrap(); + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .distinct()? + .build()?; + + let expected = "Aggregate: groupBy=[[test.a, test.b]], aggr=[[]]\ + \n Projection: test.a, test.b\ + \n TableScan: test"; + + assert_optimized_plan_eq( + Arc::new(ReplaceDistinctWithAggregate::new()), + &plan, + expected, + ) + } +} diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3cab42de0f01..2a798fe71794 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -154,7 +154,7 @@ struct ConstEvaluator<'a> { /// traversal when we are N levels deep in the tree, one entry for /// this Expr and each of its parents. 
/// - /// After visiting all siblings if `can_evauate.top()`` is true, that + /// After visiting all siblings if `can_evaluate.top()` is true, that /// means there were no non evaluatable siblings (or their /// descendants) so this `Expr` can be evaluated can_evaluate: Vec, diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 960dc9b376f0..69a11358b8e9 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -34,7 +34,9 @@ use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::type_coercion::other::{ get_coerce_type_for_case_when, get_coerce_type_for_list, }; -use datafusion_expr::type_coercion::{is_date, is_numeric, is_timestamp, is_uft8}; +use datafusion_expr::type_coercion::{ + is_date, is_numeric, is_timestamp, is_utf8_or_large_utf8, +}; use datafusion_expr::utils::from_plan; use datafusion_expr::{ aggregate_function, function, is_false, is_not_false, is_not_true, is_not_unknown, @@ -411,7 +413,7 @@ impl ExprRewriter for TypeCoercionRewriter { window_frame, }) => { let window_frame = - get_coerced_window_frame(window_frame, &self.schema, &order_by)?; + coerce_window_frame(window_frame, &self.schema, &order_by)?; let expr = Expr::WindowFunction(WindowFunction::new( fun, args, @@ -426,95 +428,127 @@ impl ExprRewriter for TypeCoercionRewriter { } } -/// Casts the ScalarValue `value` to coerced type. -// When coerced type is `Interval` we use `parse_interval` since `try_from_string` not -// supports conversion from string to Interval -fn convert_to_coerced_type( - coerced_type: &DataType, - value: &ScalarValue, -) -> Result { +/// Casts the given `value` to `target_type`. Note that this function +/// only considers `Null` or `Utf8` values. +fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result { match value { - // In here we do casting either for NULL types or - // ScalarValue::Utf8(Some(val)). The other types are already casted. - // The reason is that we convert the sqlparser result - // to the Utf8 for all possible cases. Hence the types other than Utf8 - // are already casted to appropriate type. Therefore they can be returned directly. + // Coerce Utf8 values: ScalarValue::Utf8(Some(val)) => { - // we need special handling for Interval types - if let DataType::Interval(..) = coerced_type { + // When `target_type` is `Interval`, we use `parse_interval` since + // `try_from_string` does not support `String` to `Interval` coercions. + if let DataType::Interval(..) = target_type { parse_interval("millisecond", val) } else { - ScalarValue::try_from_string(val.clone(), coerced_type) + ScalarValue::try_from_string(val.clone(), target_type) } } s => { if s.is_null() { - ScalarValue::try_from(coerced_type) + // Coerce `Null` values: + ScalarValue::try_from(target_type) } else { + // Values except `Utf8`/`Null` variants already have the right type + // (casted before) since we convert `sqlparser` outputs to `Utf8` + // for all possible cases. Therefore, we return a clone here. Ok(s.clone()) } } } } +/// This function coerces `value` to `target_type` in a range-aware fashion. +/// If the coercion is successful, we return an `Ok` value with the result. +/// If the coercion fails because `target_type` is not wide enough (i.e. we +/// can not coerce to `target_type`, but we can to a wider type in the same +/// family), we return a `Null` value of this type to signal this situation. +/// Downstream code uses this signal to treat these values as *unbounded*. 
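+///
+/// For example (an illustrative case): coercing the literal `1000` to `Int8`
+/// fails, but it does fit into `Int64` (the widest type of that integer family),
+/// so this function returns an `Int8(NULL)` value and the corresponding window
+/// frame bound is treated as unbounded.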
+fn coerce_scalar_range_aware( + target_type: &DataType, + value: &ScalarValue, +) -> Result { + coerce_scalar(target_type, value).or_else(|err| { + // If type coercion fails, check if the largest type in family works: + if let Some(largest_type) = get_widest_type_in_family(target_type) { + coerce_scalar(largest_type, value).map_or_else( + |_| { + Err(DataFusionError::Execution(format!( + "Cannot cast {value:?} to {target_type:?}" + ))) + }, + |_| ScalarValue::try_from(target_type), + ) + } else { + Err(err) + } + }) +} + +/// This function returns the widest type in the family of `given_type`. +/// If the given type is already the widest type, it returns `None`. +/// For example, if `given_type` is `Int8`, it returns `Int64`. +fn get_widest_type_in_family(given_type: &DataType) -> Option<&DataType> { + match given_type { + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => Some(&DataType::UInt64), + DataType::Int8 | DataType::Int16 | DataType::Int32 => Some(&DataType::Int64), + DataType::Float16 | DataType::Float32 => Some(&DataType::Float64), + _ => None, + } +} + +/// Coerces the given (window frame) `bound` to `target_type`. fn coerce_frame_bound( - coerced_type: &DataType, + target_type: &DataType, bound: &WindowFrameBound, ) -> Result { - Ok(match bound { - WindowFrameBound::Preceding(val) => { - WindowFrameBound::Preceding(convert_to_coerced_type(coerced_type, val)?) + match bound { + WindowFrameBound::Preceding(v) => { + coerce_scalar_range_aware(target_type, v).map(WindowFrameBound::Preceding) } - WindowFrameBound::CurrentRow => WindowFrameBound::CurrentRow, - WindowFrameBound::Following(val) => { - WindowFrameBound::Following(convert_to_coerced_type(coerced_type, val)?) + WindowFrameBound::CurrentRow => Ok(WindowFrameBound::CurrentRow), + WindowFrameBound::Following(v) => { + coerce_scalar_range_aware(target_type, v).map(WindowFrameBound::Following) } - }) + } } -fn get_coerced_window_frame( +// Coerces the given `window_frame` to use appropriate natural types. +// For example, ROWS and GROUPS frames use `UInt64` during calculations. 
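+// RANGE frames instead take the type of the first ORDER BY column, or an
+// interval type when that column is a date or timestamp.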
+fn coerce_window_frame( window_frame: WindowFrame, schema: &DFSchemaRef, expressions: &[Expr], ) -> Result { - fn get_coerced_type(column_type: &DataType) -> Result { - if is_numeric(column_type) | is_uft8(column_type) { - Ok(column_type.clone()) - } else if is_timestamp(column_type) || is_date(column_type) { - Ok(DataType::Interval(IntervalUnit::MonthDayNano)) - } else { - Err(DataFusionError::Internal(format!( - "Cannot run range queries on datatype: {column_type:?}" - ))) - } - } - let mut window_frame = window_frame; let current_types = expressions .iter() .map(|e| e.get_type(schema)) .collect::>>()?; - match &mut window_frame.units { + let target_type = match window_frame.units { WindowFrameUnits::Range => { - let col_type = current_types.first().ok_or_else(|| { - DataFusionError::Internal("ORDER BY column cannot be empty".to_string()) - })?; - let coerced_type = get_coerced_type(col_type)?; - window_frame.start_bound = - coerce_frame_bound(&coerced_type, &window_frame.start_bound)?; - window_frame.end_bound = - coerce_frame_bound(&coerced_type, &window_frame.end_bound)?; - } - WindowFrameUnits::Rows | WindowFrameUnits::Groups => { - let coerced_type = DataType::UInt64; - window_frame.start_bound = - coerce_frame_bound(&coerced_type, &window_frame.start_bound)?; - window_frame.end_bound = - coerce_frame_bound(&coerced_type, &window_frame.end_bound)?; + if let Some(col_type) = current_types.first() { + if is_numeric(col_type) || is_utf8_or_large_utf8(col_type) { + col_type + } else if is_timestamp(col_type) || is_date(col_type) { + &DataType::Interval(IntervalUnit::MonthDayNano) + } else { + return Err(DataFusionError::Internal(format!( + "Cannot run range queries on datatype: {col_type:?}" + ))); + } + } else { + return Err(DataFusionError::Internal( + "ORDER BY column cannot be empty".to_string(), + )); + } } - } + WindowFrameUnits::Rows | WindowFrameUnits::Groups => &DataType::UInt64, + }; + window_frame.start_bound = + coerce_frame_bound(target_type, &window_frame.start_bound)?; + window_frame.end_bound = coerce_frame_bound(target_type, &window_frame.end_bound)?; Ok(window_frame) } + // Support the `IsTrue` `IsNotTrue` `IsFalse` `IsNotFalse` type coercion. // The above op will be rewrite to the binary op when creating the physical op. 
fn get_casted_expr_for_bool_op(expr: &Expr, schema: &DFSchemaRef) -> Result { @@ -954,6 +988,35 @@ mod test { assert!(err.unwrap_err().to_string().contains( "There isn't a common type to coerce Int64 and Utf8 in LIKE expression" )); + + // ilike + let expr = Box::new(col("a")); + let pattern = Box::new(lit(ScalarValue::new_utf8("abc"))); + let ilike_expr = Expr::ILike(Like::new(false, expr, pattern, None)); + let empty = empty_with_type(DataType::Utf8); + let plan = LogicalPlan::Projection(Projection::try_new(vec![ilike_expr], empty)?); + let expected = "Projection: a ILIKE Utf8(\"abc\")\n EmptyRelation"; + assert_optimized_plan_eq(&plan, expected)?; + + let expr = Box::new(col("a")); + let pattern = Box::new(lit(ScalarValue::Null)); + let ilike_expr = Expr::ILike(Like::new(false, expr, pattern, None)); + let empty = empty_with_type(DataType::Utf8); + let plan = LogicalPlan::Projection(Projection::try_new(vec![ilike_expr], empty)?); + let expected = "Projection: a ILIKE CAST(NULL AS Utf8) AS a ILIKE NULL \ + \n EmptyRelation"; + assert_optimized_plan_eq(&plan, expected)?; + + let expr = Box::new(col("a")); + let pattern = Box::new(lit(ScalarValue::new_utf8("abc"))); + let ilike_expr = Expr::ILike(Like::new(false, expr, pattern, None)); + let empty = empty_with_type(DataType::Int64); + let plan = LogicalPlan::Projection(Projection::try_new(vec![ilike_expr], empty)?); + let err = assert_optimized_plan_eq(&plan, expected); + assert!(err.is_err()); + assert!(err.unwrap_err().to_string().contains( + "There isn't a common type to coerce Int64 and Utf8 in ILIKE expression" + )); Ok(()) } @@ -1034,6 +1097,21 @@ mod test { #[test] fn test_type_coercion_rewrite() -> Result<()> { + // gt + let schema = Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ); + let mut rewriter = TypeCoercionRewriter { schema }; + let expr = is_true(lit(12i32).gt(lit(13i64))); + let expected = is_true(cast(lit(12i32), DataType::Int64).gt(lit(13i64))); + let result = expr.rewrite(&mut rewriter)?; + assert_eq!(expected, result); + + // eq let schema = Arc::new( DFSchema::new_with_metadata( vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], @@ -1046,8 +1124,22 @@ mod test { let expected = is_true(cast(lit(12i32), DataType::Int64).eq(lit(13i64))); let result = expr.rewrite(&mut rewriter)?; assert_eq!(expected, result); + + // lt + let schema = Arc::new( + DFSchema::new_with_metadata( + vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ); + let mut rewriter = TypeCoercionRewriter { schema }; + let expr = is_true(lit(12i32).lt(lit(13i64))); + let expected = is_true(cast(lit(12i32), DataType::Int64).lt(lit(13i64))); + let result = expr.rewrite(&mut rewriter)?; + assert_eq!(expected, result); + Ok(()) - // TODO add more test for this } #[test] diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index d6e1cb631125..94a6d6404dde 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -51,7 +51,7 @@ use std::sync::Arc; /// 4. 
`literal_expr IN (cast(expr1) , cast(expr2), ...)` /// /// If the expression matches one of the forms above, the rule will -/// ensure the value of `literal` is in within range(min, max) of the +/// ensure the value of `literal` is in range(min, max) of the /// expr's data_type, and if the scalar is within range, the literal /// will be casted to the data type of expr on the other side, and the /// cast will be removed from the other side. diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index 747fa62089ed..c6c03ad793e3 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -416,7 +416,7 @@ pub fn only_or_err(slice: &[T]) -> Result<&T> { /// Rewrites `expr` using `rewriter`, ensuring that the output has the /// same name as `expr` prior to rewrite, adding an alias if necessary. /// -/// This is important when optimzing plans to ensure the the output +/// This is important when optimizing plans to ensure the output /// schema of plan nodes don't change after optimization pub fn rewrite_preserving_name(expr: Expr, rewriter: &mut R) -> Result where @@ -436,7 +436,7 @@ fn name_for_alias(expr: &Expr) -> Result { } } -/// Ensure `expr` has the name name as `original_name` by adding an +/// Ensure `expr` has the name as `original_name` by adding an /// alias if necessary. fn add_alias_if_changed(original_name: String, expr: Expr) -> Result { let new_name = name_for_alias(&expr)?; diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index eac849e34781..b84d15fae735 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -105,8 +105,7 @@ fn distribute_by() -> Result<()> { let sql = "SELECT col_int32, col_utf8 FROM test DISTRIBUTE BY (col_utf8)"; let plan = test_sql(sql)?; let expected = "Repartition: DistributeBy(col_utf8)\ - \n Projection: test.col_int32, test.col_utf8\ - \n TableScan: test projection=[col_int32, col_utf8]"; + \n TableScan: test projection=[col_int32, col_utf8]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -121,9 +120,8 @@ fn semi_join_with_join_filter() -> Result<()> { let expected = "Projection: test.col_utf8\ \n LeftSemi Join: test.col_int32 = t2.col_int32 Filter: test.col_uint32 != t2.col_uint32\ \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ - \n Projection: t2.col_int32, t2.col_uint32\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32, col_uint32]"; + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32, col_uint32]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -136,11 +134,10 @@ fn anti_join_with_join_filter() -> Result<()> { AND test.col_uint32 != t2.col_uint32)"; let plan = test_sql(sql)?; let expected = "Projection: test.col_utf8\ - \n LeftAnti Join: test.col_int32 = t2.col_int32 Filter: test.col_uint32 != t2.col_uint32\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ - \n Projection: t2.col_int32, t2.col_uint32\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32, col_uint32]"; + \n LeftAnti Join: test.col_int32 = t2.col_int32 Filter: test.col_uint32 != t2.col_uint32\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32, col_uint32]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -151,12 +148,10 @@ fn where_exists_distinct() -> Result<()> { let sql = "SELECT col_int32 FROM test WHERE EXISTS (\ 
SELECT DISTINCT col_int32 FROM test t2 WHERE test.col_int32 = t2.col_int32)"; let plan = test_sql(sql)?; - let expected = "Projection: test.col_int32\ - \n LeftSemi Join: test.col_int32 = t2.col_int32\ - \n TableScan: test projection=[col_int32]\ - \n Projection: t2.col_int32\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32]"; + let expected = "LeftSemi Join: test.col_int32 = t2.col_int32\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -169,9 +164,9 @@ fn intersect() -> Result<()> { let plan = test_sql(sql)?; let expected = "LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Distinct:\ + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ \n LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Distinct:\ + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ \n TableScan: test projection=[col_int32, col_utf8]\ \n TableScan: test projection=[col_int32, col_utf8]\ \n TableScan: test projection=[col_int32, col_utf8]"; @@ -185,9 +180,9 @@ fn between_date32_plus_interval() -> Result<()> { WHERE col_date32 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Projection: COUNT(Int64(1))\n Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ - \n Filter: test.col_date32 >= Date32(\"10303\") AND test.col_date32 <= Date32(\"10393\")\ - \n TableScan: test projection=[col_date32]"; + "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ + \n Filter: test.col_date32 >= Date32(\"10303\") AND test.col_date32 <= Date32(\"10393\")\ + \n TableScan: test projection=[col_date32]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -198,9 +193,9 @@ fn between_date64_plus_interval() -> Result<()> { WHERE col_date64 between '1998-03-18T00:00:00' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Projection: COUNT(Int64(1))\n Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ - \n Filter: test.col_date64 >= Date64(\"890179200000\") AND test.col_date64 <= Date64(\"897955200000\")\ - \n TableScan: test projection=[col_date64]"; + "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ + \n Filter: test.col_date64 >= Date64(\"890179200000\") AND test.col_date64 <= Date64(\"897955200000\")\ + \n TableScan: test projection=[col_date64]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } @@ -276,15 +271,15 @@ fn propagate_empty_relation() { fn join_keys_in_subquery_alias() { let sql = "SELECT * FROM test AS A, ( SELECT col_int32 as key FROM test ) AS B where A.col_int32 = B.key;"; let plan = test_sql(sql).unwrap(); - let expected = "Projection: a.col_int32, a.col_uint32, a.col_utf8, a.col_date32, a.col_date64, a.col_ts_nano_none, a.col_ts_nano_utc, b.key\ - \n Inner Join: a.col_int32 = b.key\ - \n SubqueryAlias: a\ + let expected = "Inner Join: a.col_int32 = b.key\ + \n SubqueryAlias: a\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ + \n SubqueryAlias: b\ + \n Projection: test.col_int32 AS key\ \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ - \n SubqueryAlias: b\ - \n Projection: test.col_int32 AS key\ - \n Filter: test.col_int32 IS NOT 
NULL\ - \n TableScan: test projection=[col_int32]"; + \n TableScan: test projection=[col_int32]"; + assert_eq!(expected, format!("{plan:?}")); } @@ -292,19 +287,18 @@ fn join_keys_in_subquery_alias() { fn join_keys_in_subquery_alias_1() { let sql = "SELECT * FROM test AS A, ( SELECT test.col_int32 AS key FROM test JOIN test AS C on test.col_int32 = C.col_int32 ) AS B where A.col_int32 = B.key;"; let plan = test_sql(sql).unwrap(); - let expected = "Projection: a.col_int32, a.col_uint32, a.col_utf8, a.col_date32, a.col_date64, a.col_ts_nano_none, a.col_ts_nano_utc, b.key\ - \n Inner Join: a.col_int32 = b.key\ - \n SubqueryAlias: a\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ - \n SubqueryAlias: b\ - \n Projection: test.col_int32 AS key\ - \n Inner Join: test.col_int32 = c.col_int32\ + let expected = "Inner Join: a.col_int32 = b.key\ + \n SubqueryAlias: a\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ + \n SubqueryAlias: b\ + \n Projection: test.col_int32 AS key\ + \n Inner Join: test.col_int32 = c.col_int32\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: c\ \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: c\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]"; + \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan:?}")); } @@ -312,11 +306,25 @@ fn join_keys_in_subquery_alias_1() { fn push_down_filter_groupby_expr_contains_alias() { let sql = "SELECT * FROM (SELECT (col_int32 + col_uint32) AS c, count(*) FROM test GROUP BY 1) where c > 3"; let plan = test_sql(sql).unwrap(); - let expected = "Projection: c, COUNT(UInt8(1))\ - \n Projection: test.col_int32 + test.col_uint32 AS c, COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[COUNT(UInt8(1))]]\ - \n Filter: test.col_int32 + CAST(test.col_uint32 AS Int32) > Int32(3)\ - \n TableScan: test projection=[col_int32, col_uint32]"; + let expected = "Projection: test.col_int32 + test.col_uint32 AS c, COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: test.col_int32 + CAST(test.col_uint32 AS Int32) > Int32(3)\ + \n TableScan: test projection=[col_int32, col_uint32]"; + assert_eq!(expected, format!("{plan:?}")); +} + +#[test] +// issue: https://github.com/apache/arrow-datafusion/issues/5334 +fn test_same_name_but_not_ambiguous() { + let sql = "SELECT t1.col_int32 AS col_int32 FROM test t1 intersect SELECT col_int32 FROM test t2"; + let plan = test_sql(sql).unwrap(); + let expected = "LeftSemi Join: col_int32 = t2.col_int32\ + \n Aggregate: groupBy=[[col_int32]], aggr=[[]]\ + \n Projection: t1.col_int32 AS col_int32\ + \n SubqueryAlias: t1\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan:?}")); } @@ -348,10 +356,7 @@ struct MySchemaProvider { } impl ContextProvider for MySchemaProvider { - fn get_table_provider( - &self, - name: TableReference, - ) -> datafusion_common::Result> { + fn get_table_provider(&self, name: TableReference) -> Result> { let table_name = name.table(); if 
table_name.starts_with("test") { let schema = Schema::new_with_metadata( diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 8c0e1c623df7..d8cc89d79925 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-physical-expr" description = "Physical expression implementation for DataFusion query engine" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -43,15 +43,15 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "32.0.0", features = ["prettyprint"] } -arrow-buffer = "32.0.0" -arrow-schema = "32.0.0" +arrow = { version = "34.0.0", features = ["prettyprint"] } +arrow-buffer = "34.0.0" +arrow-schema = "34.0.0" blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4.23", default-features = false } -datafusion-common = { path = "../common", version = "18.0.0" } -datafusion-expr = { path = "../expr", version = "18.0.0" } -datafusion-row = { path = "../row", version = "18.0.0" } +datafusion-common = { path = "../common", version = "19.0.0" } +datafusion-expr = { path = "../expr", version = "19.0.0" } +datafusion-row = { path = "../row", version = "19.0.0" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", features = ["raw"] } indexmap = "1.9.2" diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index ec9096ffc02a..0024deb3d4d8 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -118,7 +118,34 @@ impl BinaryExpr { impl std::fmt::Display for BinaryExpr { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{} {} {}", self.left, self.op, self.right) + // Put parentheses around child binary expressions so that we can see the difference + // between `(a OR b) AND c` and `a OR (b AND c)`. We only insert parentheses when needed, + // based on operator precedence. For example, `(a AND b) OR c` and `a AND b OR c` are + // equivalent and the parentheses are not necessary. 
+ + fn write_child( + f: &mut std::fmt::Formatter, + expr: &dyn PhysicalExpr, + precedence: u8, + ) -> std::fmt::Result { + if let Some(child) = expr.as_any().downcast_ref::() { + let p = child.op.precedence(); + if p == 0 || p < precedence { + write!(f, "({child})")?; + } else { + write!(f, "{child}")?; + } + } else { + write!(f, "{expr}")?; + } + + Ok(()) + } + + let precedence = self.op.precedence(); + write_child(f, self.left.as_ref(), precedence)?; + write!(f, " {} ", self.op)?; + write_child(f, self.right.as_ref(), precedence) } } @@ -4249,4 +4276,67 @@ mod tests { Ok(()) } + + #[test] + fn test_display_and_or_combo() { + let expr = BinaryExpr::new( + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(1)), + Operator::And, + lit(ScalarValue::from(2)), + )), + Operator::And, + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(3)), + Operator::And, + lit(ScalarValue::from(4)), + )), + ); + assert_eq!(expr.to_string(), "1 AND 2 AND 3 AND 4"); + + let expr = BinaryExpr::new( + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(1)), + Operator::Or, + lit(ScalarValue::from(2)), + )), + Operator::Or, + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(3)), + Operator::Or, + lit(ScalarValue::from(4)), + )), + ); + assert_eq!(expr.to_string(), "1 OR 2 OR 3 OR 4"); + + let expr = BinaryExpr::new( + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(1)), + Operator::And, + lit(ScalarValue::from(2)), + )), + Operator::Or, + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(3)), + Operator::And, + lit(ScalarValue::from(4)), + )), + ); + assert_eq!(expr.to_string(), "1 AND 2 OR 3 AND 4"); + + let expr = BinaryExpr::new( + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(1)), + Operator::Or, + lit(ScalarValue::from(2)), + )), + Operator::And, + Arc::new(BinaryExpr::new( + lit(ScalarValue::from(3)), + Operator::Or, + lit(ScalarValue::from(4)), + )), + ); + assert_eq!(expr.to_string(), "(1 OR 2) AND (3 OR 4)"); + } } diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index ce5f1c2794b5..bceed0a34c48 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -58,7 +58,7 @@ impl TryCastExpr { impl fmt::Display for TryCastExpr { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "CAST({} AS {:?})", self.expr, self.cast_type) + write!(f, "TRY_CAST({} AS {:?})", self.expr, self.cast_type) } } @@ -132,7 +132,7 @@ pub fn try_cast( Ok(Arc::new(TryCastExpr::new(expr, cast_type))) } else { Err(DataFusionError::NotImplemented(format!( - "Unsupported CAST from {expr_type:?} to {cast_type:?}" + "Unsupported TRY_CAST from {expr_type:?} to {cast_type:?}" ))) } } @@ -155,7 +155,7 @@ mod tests { // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A - // 2. construct a physical expression of CAST(a AS B) + // 2. construct a physical expression of TRY_CAST(a AS B) // 3. evaluate the expression // 4. verify that the resulting expression is of type B // 5. verify that the resulting values are downcastable and correct @@ -171,7 +171,7 @@ mod tests { // verify that its display is correct assert_eq!( - format!("CAST(a@0 AS {:?})", $TYPE), + format!("TRY_CAST(a@0 AS {:?})", $TYPE), format!("{}", expression) ); @@ -202,7 +202,7 @@ mod tests { // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A - // 2. construct a physical expression of CAST(a AS B) + // 2. 
construct a physical expression of TRY_CAST(a AS B) // 3. evaluate the expression // 4. verify that the resulting expression is of type B // 5. verify that the resulting values are downcastable and correct @@ -218,7 +218,7 @@ mod tests { // verify that its display is correct assert_eq!( - format!("CAST(a@0 AS {:?})", $TYPE), + format!("TRY_CAST(a@0 AS {:?})", $TYPE), format!("{}", expression) ); @@ -542,7 +542,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let result = try_cast(col("a", &schema).unwrap(), &schema, DataType::LargeBinary); - result.expect_err("expected Invalid CAST"); + result.expect_err("expected Invalid TRY_CAST"); } // create decimal array with the specified precision and scale diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 31d28f820555..105de17c2e74 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -393,7 +393,10 @@ pub fn create_physical_expr( execution_props, )?); } - + // udfs with zero params expect null array as input + if args.is_empty() { + physical_args.push(Arc::new(Literal::new(ScalarValue::Null))); + } udf::create_physical_expr(fun.clone().as_ref(), &physical_args, input_schema) } Expr::Between(Between { diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 7a6529c25d24..bcf0dac02d94 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-proto" description = "Protobuf serialization of DataFusion logical plan expressions" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -40,11 +40,11 @@ default = [] json = ["pbjson", "serde", "serde_json"] [dependencies] -arrow = "32.0.0" +arrow = "34.0.0" chrono = { version = "0.4", default-features = false } -datafusion = { path = "../core", version = "18.0.0" } -datafusion-common = { path = "../common", version = "18.0.0" } -datafusion-expr = { path = "../expr", version = "18.0.0" } +datafusion = { path = "../core", version = "19.0.0" } +datafusion-common = { path = "../common", version = "19.0.0" } +datafusion-expr = { path = "../expr", version = "19.0.0" } object_store = { version = "0.5.4" } parking_lot = { version = "0.12" } pbjson = { version = "0.5", optional = true } @@ -60,4 +60,4 @@ tokio = "1.18" [build-dependencies] # Pin these dependencies so that the generated output is deterministic pbjson-build = { version = "=0.5.1" } -prost-build = { version = "=0.11.6" } +prost-build = { version = "=0.11.7" } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 5bbe1e5e648f..103fe3face51 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -743,6 +743,11 @@ message Struct{ repeated Field sub_field_types = 1; } +message Map { + Field field_type = 1; + bool keys_sorted = 2; +} + enum UnionMode{ sparse = 0; dense = 1; @@ -894,6 +899,7 @@ message ArrowType{ Struct STRUCT = 28; Union UNION = 29; Dictionary DICTIONARY = 30; + Map MAP = 33; } } diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 1c84da002503..335f6f1c59da 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -1165,6 +1165,9 @@ impl serde::Serialize for ArrowType { 
arrow_type::ArrowTypeEnum::Dictionary(v) => { struct_ser.serialize_field("DICTIONARY", v)?; } + arrow_type::ArrowTypeEnum::Map(v) => { + struct_ser.serialize_field("MAP", v)?; + } } } struct_ser.end() @@ -1214,6 +1217,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "STRUCT", "UNION", "DICTIONARY", + "MAP", ]; #[allow(clippy::enum_variant_names)] @@ -1250,6 +1254,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType { Struct, Union, Dictionary, + Map, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -1303,6 +1308,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "STRUCT" => Ok(GeneratedField::Struct), "UNION" => Ok(GeneratedField::Union), "DICTIONARY" => Ok(GeneratedField::Dictionary), + "MAP" => Ok(GeneratedField::Map), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -1542,6 +1548,13 @@ impl<'de> serde::Deserialize<'de> for ArrowType { return Err(serde::de::Error::duplicate_field("DICTIONARY")); } arrow_type_enum__ = map.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Dictionary) +; + } + GeneratedField::Map => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("MAP")); + } + arrow_type_enum__ = map.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Map) ; } } @@ -11139,6 +11152,116 @@ impl<'de> serde::Deserialize<'de> for LogicalPlanNode { deserializer.deserialize_struct("datafusion.LogicalPlanNode", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for Map { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.field_type.is_some() { + len += 1; + } + if self.keys_sorted { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.Map", len)?; + if let Some(v) = self.field_type.as_ref() { + struct_ser.serialize_field("fieldType", v)?; + } + if self.keys_sorted { + struct_ser.serialize_field("keysSorted", &self.keys_sorted)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Map { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "field_type", + "fieldType", + "keys_sorted", + "keysSorted", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + FieldType, + KeysSorted, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "fieldType" | "field_type" => Ok(GeneratedField::FieldType), + "keysSorted" | "keys_sorted" => Ok(GeneratedField::KeysSorted), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Map; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.Map") + } + + fn 
visit_map(self, mut map: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut field_type__ = None; + let mut keys_sorted__ = None; + while let Some(k) = map.next_key()? { + match k { + GeneratedField::FieldType => { + if field_type__.is_some() { + return Err(serde::de::Error::duplicate_field("fieldType")); + } + field_type__ = map.next_value()?; + } + GeneratedField::KeysSorted => { + if keys_sorted__.is_some() { + return Err(serde::de::Error::duplicate_field("keysSorted")); + } + keys_sorted__ = Some(map.next_value()?); + } + } + } + Ok(Map { + field_type: field_type__, + keys_sorted: keys_sorted__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.Map", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for NegativeNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 5497a878372c..029380a99ed5 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -931,6 +931,14 @@ pub struct Struct { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Map { + #[prost(message, optional, boxed, tag = "1")] + pub field_type: ::core::option::Option<::prost::alloc::boxed::Box>, + #[prost(bool, tag = "2")] + pub keys_sorted: bool, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Union { #[prost(message, repeated, tag = "1")] pub union_types: ::prost::alloc::vec::Vec, @@ -1139,7 +1147,7 @@ pub struct Decimal128 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 32, 15, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 32, 15, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -1217,6 +1225,8 @@ pub mod arrow_type { Union(super::Union), #[prost(message, tag = "30")] Dictionary(::prost::alloc::boxed::Box), + #[prost(message, tag = "33")] + Map(::prost::alloc::boxed::Box), } } /// Useful for representing an empty enum variant in rust diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index e54dee9f212c..250aa34435e2 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -348,6 +348,12 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType { let value_datatype = dict.as_ref().value.as_deref().required("value")?; DataType::Dictionary(Box::new(key_datatype), Box::new(value_datatype)) } + arrow_type::ArrowTypeEnum::Map(map) => { + let field: Field = + map.as_ref().field_type.as_deref().required("field_type")?; + let keys_sorted = map.keys_sorted; + DataType::Map(Box::new(field), keys_sorted) + } }) } } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 6f44542b90b2..bf3c733c005a 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -2212,6 +2212,17 @@ mod roundtrip_tests { 4, )), ), + DataType::Map( + new_box_field( + "entries", + DataType::Struct(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + ]), + true, + ), + false, + ), ]; for test_case in test_cases.into_iter() { 
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index dbd91faa4db8..d228274e3958 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -218,9 +218,12 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { DataType::Decimal256(_, _) => { return Err(Error::General("Proto serialization error: The Decimal256 data type is not yet supported".to_owned())) } - DataType::Map(_, _) => { - return Err(Error::General( - "Proto serialization error: The Map data type is not yet supported".to_owned() + DataType::Map(field, sorted) => { + Self::Map(Box::new( + protobuf::Map { + field_type: Some(Box::new(field.as_ref().try_into()?)), + keys_sorted: *sorted, + } )) } DataType::RunEndEncoded(_, _) => { diff --git a/datafusion/row/Cargo.toml b/datafusion/row/Cargo.toml index 62cdf8897698..0d7c313e5daf 100644 --- a/datafusion/row/Cargo.toml +++ b/datafusion/row/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-row" description = "Row backed by raw bytes for DataFusion query engine" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -37,8 +37,8 @@ path = "src/lib.rs" jit = ["datafusion-jit"] [dependencies] -arrow = "32.0.0" -datafusion-common = { path = "../common", version = "18.0.0" } -datafusion-jit = { path = "../jit", version = "18.0.0", optional = true } +arrow = "34.0.0" +datafusion-common = { path = "../common", version = "19.0.0" } +datafusion-jit = { path = "../jit", version = "19.0.0", optional = true } paste = "^1.0" rand = "0.8" diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 62c1def0364b..c0c2e5567f31 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-sql" description = "DataFusion SQL Query Planner" -version = "18.0.0" +version = "19.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "README.md" @@ -37,9 +37,9 @@ default = ["unicode_expressions"] unicode_expressions = [] [dependencies] -arrow-schema = "32.0.0" -datafusion-common = { path = "../common", version = "18.0.0" } -datafusion-expr = { path = "../expr", version = "18.0.0" } +arrow-schema = "34.0.0" +datafusion-common = { path = "../common", version = "19.0.0" } +datafusion-expr = { path = "../expr", version = "19.0.0" } log = "^0.4" sqlparser = "0.30" diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index d25268c87eb0..521466214ff8 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -399,7 +399,7 @@ impl<'a> DFParser<'a> { let token = self.parser.next_token(); match &token.token { Token::Word(w) => CompressionTypeVariant::from_str(&w.value), - _ => self.expected("one of GZIP, BZIP2, XZ", token), + _ => self.expected("one of GZIP, BZIP2, XZ, ZSTD", token), } } @@ -586,6 +586,7 @@ mod tests { ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE GZIP LOCATION 'foo.csv'", "GZIP"), ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE BZIP2 LOCATION 'foo.csv'", "BZIP2"), ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE XZ LOCATION 'foo.csv'", "XZ"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE ZSTD LOCATION 'foo.csv'", "ZSTD"), ]; for (sql, file_compression_type) in sqls { 
let expected = Statement::CreateExternalTable(CreateExternalTable { diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index c59c42e93c0d..eb7ece87d6b1 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -17,9 +17,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::normalize_ident; -use datafusion_common::{Column, DFSchema, DataFusionError, Result, ScalarValue}; -use datafusion_expr::expr_rewriter::rewrite_sort_cols_by_aggs; -use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, Repartition}; +use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue}; +use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{Expr as SQLExpr, Offset as SQLOffset, OrderByExpr, Query}; use sqlparser::parser::ParserError::ParserError; @@ -151,55 +150,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return Ok(plan); } - let mut order_by_rex = order_by + let order_by_rex = order_by .into_iter() .map(|e| self.order_by_to_sort_expr(e, plan.schema())) .collect::>>()?; - order_by_rex = rewrite_sort_cols_by_aggs(order_by_rex, &plan)?; - let schema = plan.schema(); - - // if current plan is distinct or current plan is repartition and its child plan is distinct, - // then this plan is a select distinct plan - let is_select_distinct = match plan { - LogicalPlan::Distinct(_) => true, - LogicalPlan::Repartition(Repartition { ref input, .. }) => { - matches!(input.as_ref(), &LogicalPlan::Distinct(_)) - } - _ => false, - }; - - let mut missing_cols: Vec = vec![]; - // Collect sort columns that are missing in the input plan's schema - order_by_rex - .clone() - .into_iter() - .try_for_each::<_, Result<()>>(|expr| { - let columns = expr.to_columns()?; - - columns.into_iter().for_each(|c| { - if schema.field_from_column(&c).is_err() { - missing_cols.push(c); - } - }); - - Ok(()) - })?; - - // for select distinct, order by expressions must exist in select list - if is_select_distinct && !missing_cols.is_empty() { - let missing_col_names = missing_cols - .iter() - .map(|col| col.flat_name()) - .collect::(); - let error_msg = format!( - "For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list", - ); - return Err(DataFusionError::Plan(error_msg)); - } - - LogicalPlanBuilder::from(plan) - .create_sort_plan(order_by_rex, missing_cols)? 
- .build() + LogicalPlanBuilder::from(plan).sort(order_by_rex)?.build() } } diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 02108067c988..8cd5ee4cadb1 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -19,14 +19,14 @@ name = "datafusion-substrait" description = "DataFusion Substrait Producer and Consumer" license = "Apache-2.0" -version = "18.0.0" +version = "19.0.0" edition = "2021" rust-version = "1.62" [dependencies] async-recursion = "1.0" chrono = "0.4.23" -datafusion = { version = "18.0.0", path = "../core" } +datafusion = { version = "19.0.0", path = "../core" } itertools = "0.10.5" object_store = "0.5.4" prost = "0.11" diff --git a/datafusion/substrait/tests/roundtrip_logical_plan.rs b/datafusion/substrait/tests/roundtrip_logical_plan.rs index c2ab3df3cb40..47ce1bbcd9b6 100644 --- a/datafusion/substrait/tests/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/roundtrip_logical_plan.rs @@ -178,9 +178,8 @@ mod tests { async fn aggregate_case() -> Result<()> { assert_expected_plan( "SELECT SUM(CASE WHEN a > 0 THEN 1 ELSE NULL END) FROM data", - "Projection: SUM(CASE WHEN data.a > Int64(0) THEN Int64(1) ELSE Int64(NULL) END)\ - \n Aggregate: groupBy=[[]], aggr=[[SUM(CASE WHEN data.a > Int64(0) THEN Int64(1) ELSE Int64(NULL) END)]]\ - \n TableScan: data projection=[a]", + "Aggregate: groupBy=[[]], aggr=[[SUM(CASE WHEN data.a > Int64(0) THEN Int64(1) ELSE Int64(NULL) END)]]\ + \n TableScan: data projection=[a]", ) .await } @@ -227,13 +226,11 @@ mod tests { async fn simple_intersect() -> Result<()> { assert_expected_plan( "SELECT COUNT(*) FROM (SELECT data.a FROM data INTERSECT SELECT data2.a FROM data2);", - "Projection: COUNT(Int16(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(Int16(1))]]\ - \n LeftSemi Join: data.a = data2.a\ - \n Aggregate: groupBy=[[data.a]], aggr=[[]]\ - \n TableScan: data projection=[a]\ - \n Projection: data2.a\ - \n TableScan: data2 projection=[a]", + "Aggregate: groupBy=[[]], aggr=[[COUNT(Int16(1))]]\ + \n LeftSemi Join: data.a = data2.a\ + \n Aggregate: groupBy=[[data.a]], aggr=[[]]\ + \n TableScan: data projection=[a]\ + \n TableScan: data2 projection=[a]", ) .await } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 37b750c59ba7..1e457ee9b608 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -60,7 +60,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. | | datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_file_scans | false | When set to true, file groups will be repartitioned to achieve maximum parallelism. Currently supported only for Parquet format in which case multiple row groups from the same file may be read concurrently. If false then each row group is read serially, though different files may be read in parallel. | +| datafusion.optimizer.repartition_file_scans | true | When set to true, file groups will be repartitioned to achieve maximum parallelism. 
Currently supported only for Parquet format in which case multiple row groups from the same file may be read concurrently. If false then each row group is read serially, though different files may be read in parallel. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", would turn into the plan below which performs better in multithreaded environments "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", | | datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md index c7d490e40484..8ebf4cc678e1 100644 --- a/docs/source/user-guide/dataframe.md +++ b/docs/source/user-guide/dataframe.md @@ -72,7 +72,7 @@ execution. The plan is evaluated (executed) when an action method is invoked, su | limit | Limit the number of rows returned from this DataFrame. | | repartition | Repartition a DataFrame based on a logical partitioning scheme. | | sort | Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its `sort` method. | -| select | Create a projection based on arbitrary expressions. Example: `df..select(vec![col("c1"), abs(col("c2"))])?` | +| select | Create a projection based on arbitrary expressions. Example: `df.select(vec![col("c1"), abs(col("c2"))])?` | | select_columns | Create a projection based on column names. Example: `df.select_columns(&["id", "name"])?`. | | union | Calculate the union of two DataFrames, preserving duplicate rows. The two DataFrames must have exactly the same schema. | | union_distinct | Calculate the distinct union of two DataFrames. The two DataFrames must have exactly the same schema. 
| diff --git a/parquet-test-utils/Cargo.toml b/parquet-test-utils/Cargo.toml index c63af8362be2..0d41aa823de1 100644 --- a/parquet-test-utils/Cargo.toml +++ b/parquet-test-utils/Cargo.toml @@ -25,4 +25,4 @@ edition = "2021" [dependencies] datafusion = { path = "../datafusion/core" } object_store = "0.5.4" -parquet = "32.0.0" +parquet = "34.0.0" diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 2d591926be21..b7863a385f05 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -23,7 +23,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { version = "32.0.0", features = ["prettyprint"] } +arrow = { version = "34.0.0", features = ["prettyprint"] } datafusion-common = { path = "../datafusion/common" } env_logger = "0.10.0" rand = "0.8" From eb65472fb15504786e07786c3bc7b23fac98f6d5 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 1 Mar 2023 21:27:59 +1100 Subject: [PATCH 03/13] Update column new() docstring --- datafusion/common/src/column.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index f6a660149954..39242eea9ee0 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -35,7 +35,12 @@ pub struct Column { } impl Column { - /// Create Column from optional qualifier and name + /// Create Column from optional qualifier and name. The optional qualifier, if present, + /// will be parsed and normalized by default. + /// + /// See full details on [`TableReference::parse_str`] + /// + /// [`TableReference::parse_str`]: crate::TableReference::parse_str pub fn new( relation: Option>, name: impl Into, From 1d613ed073c202aa2a6d1165f4456c7b2f1ac1da Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 3 Mar 2023 22:23:11 +1100 Subject: [PATCH 04/13] Add tests for dfschema search --- datafusion/common/src/table_reference.rs | 12 +- datafusion/sql/src/expr/identifier.rs | 330 +++++++++++++++++------ 2 files changed, 255 insertions(+), 87 deletions(-) diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 60973d46410d..3ba3ee8f4f15 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -87,25 +87,25 @@ impl<'a> TableReference<'a> { /// Convenience method for creating a `Partial` variant of `TableReference` pub fn partial( - table: impl Into>, schema: impl Into>, + table: impl Into>, ) -> TableReference<'a> { TableReference::Partial { - table: table.into(), schema: schema.into(), + table: table.into(), } } /// Convenience method for creating a `Full` variant of `TableReference` pub fn full( - table: impl Into>, - schema: impl Into>, catalog: impl Into>, + schema: impl Into>, + table: impl Into>, ) -> TableReference<'a> { TableReference::Full { - table: table.into(), - schema: schema.into(), catalog: catalog.into(), + schema: schema.into(), + table: table.into(), } } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index fb726f5e8713..9714ad46ab55 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -18,7 +18,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::normalize_ident; use datafusion_common::{ - Column, DFSchema, DataFusionError, Result, ScalarValue, 
TableReference, + Column, DFField, DFSchema, DataFusionError, Result, ScalarValue, TableReference, }; use datafusion_expr::{Case, Expr, GetIndexedField}; use sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -50,38 +50,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - // (relation, column name) - fn form_identifier(idents: &[String]) -> Result<(Option, &String)> { - match idents.len() { - 1 => Ok((None, &idents[0])), - 2 => Ok(( - Some(TableReference::Bare { - table: (&idents[0]).into(), - }), - &idents[1], - )), - 3 => Ok(( - Some(TableReference::Partial { - schema: (&idents[0]).into(), - table: (&idents[1]).into(), - }), - &idents[2], - )), - 4 => Ok(( - Some(TableReference::Full { - catalog: (&idents[0]).into(), - schema: (&idents[1]).into(), - table: (&idents[2]).into(), - }), - &idents[3], - )), - _ => Err(DataFusionError::Internal(format!( - "Incorrect number of identifiers: {}", - idents.len() - ))), - } - } - pub(super) fn sql_compound_identifier_to_expr( &self, ids: Vec, @@ -116,38 +84,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }) .collect::>(); - // Possibilities we search with, in order from top to bottom for each len: - // - // len = 2: - // 1. (table.column) - // 2. (column).nested - // - // len = 3: - // 1. (schema.table.column) - // 2. (table.column).nested - // 3. (column).nested1.nested2 - // - // len = 4: - // 1. (catalog.schema.table.column) - // 2. (schema.table.column).nested1 - // 3. (table.column).nested1.nested2 - // 4. (column).nested1.nested2.nested3 - // - // len = 5: - // 1. (catalog.schema.table.column).nested - // 2. (schema.table.column).nested1.nested2 - // 3. (table.column).nested1.nested2.nested3 - // 4. (column).nested1.nested2.nested3.nested4 - // - // len > 5: - // 1. (catalog.schema.table.column).nested[.nestedN]+ - // 2. (schema.table.column).nested1.nested2[.nestedN]+ - // 3. (table.column).nested1.nested2.nested3[.nestedN]+ - // 4. 
(column).nested1.nested2.nested3.nested4[.nestedN]+ - // // Currently not supporting more than one nested level // Though ideally once that support is in place, this code should work with it - // TODO: remove when can support multiple nested identifiers if ids.len() > 5 { return Err(DataFusionError::Internal(format!( @@ -155,37 +93,27 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))); } - // take at most 4 identifiers to form a Column to search with - // - 1 for the column name - // - 0 to 3 for the TableReference - let bound = ids.len().min(4); - // search from most specific to least specific - let search_result = (0..bound).rev().find_map(|i| { - let nested_names_index = i + 1; - let s = &ids[0..nested_names_index]; - let (relation, column_name) = Self::form_identifier(s).unwrap(); - let field = schema.field_with_name(relation.as_ref(), column_name).ok(); - field.map(|f| (f, nested_names_index)) - }); - + let search_result = search_dfschema(&ids, schema); match search_result { // found matching field with spare identifier(s) for nested field(s) in structure - Some((field, index)) if index < ids.len() => { + Some((field, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support multiple nested identifiers - if index < ids.len() - 1 { + if nested_names.len() > 1 { return Err(DataFusionError::Internal(format!( "Nested identifiers not yet supported for column {}", field.qualified_column().quoted_flat_name() ))); } - let nested_name = ids[index].to_string(); + let nested_name = nested_names[0].to_string(); Ok(Expr::GetIndexedField(GetIndexedField::new( Box::new(Expr::Column(field.qualified_column())), ScalarValue::Utf8(Some(nested_name)), ))) } // found matching field with no spare identifier(s) - Some((field, _index)) => Ok(Expr::Column(field.qualified_column())), + Some((field, _nested_names)) => { + Ok(Expr::Column(field.qualified_column())) + } // found no matching field, will return a default None => { // return default where use all identifiers to not have a nested field @@ -196,7 +124,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))) } else { let s = &ids[0..ids.len()]; - let (relation, column_name) = Self::form_identifier(s).unwrap(); + let (relation, column_name) = form_identifier(s).unwrap(); let relation = relation.map(|r| r.to_owned_reference()); Ok(Expr::Column(Column::new(relation, column_name))) } @@ -252,3 +180,243 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))) } } + +// (relation, column name) +fn form_identifier(idents: &[String]) -> Result<(Option, &String)> { + match idents.len() { + 1 => Ok((None, &idents[0])), + 2 => Ok(( + Some(TableReference::Bare { + table: (&idents[0]).into(), + }), + &idents[1], + )), + 3 => Ok(( + Some(TableReference::Partial { + schema: (&idents[0]).into(), + table: (&idents[1]).into(), + }), + &idents[2], + )), + 4 => Ok(( + Some(TableReference::Full { + catalog: (&idents[0]).into(), + schema: (&idents[1]).into(), + table: (&idents[2]).into(), + }), + &idents[3], + )), + _ => Err(DataFusionError::Internal(format!( + "Incorrect number of identifiers: {}", + idents.len() + ))), + } +} + +fn search_dfschema<'ids, 'schema>( + ids: &'ids [String], + schema: &'schema DFSchema, +) -> Option<(&'schema DFField, &'ids [String])> { + generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { + let field = schema.field_with_name(qualifier.as_ref(), column).ok(); + field.map(|f| (f, nested_names)) + }) +} + +// Possibilities we search with, in order from top to bottom for each len: +// +// len = 2: 
+// 1. (table.column) +// 2. (column).nested +// +// len = 3: +// 1. (schema.table.column) +// 2. (table.column).nested +// 3. (column).nested1.nested2 +// +// len = 4: +// 1. (catalog.schema.table.column) +// 2. (schema.table.column).nested1 +// 3. (table.column).nested1.nested2 +// 4. (column).nested1.nested2.nested3 +// +// len = 5: +// 1. (catalog.schema.table.column).nested +// 2. (schema.table.column).nested1.nested2 +// 3. (table.column).nested1.nested2.nested3 +// 4. (column).nested1.nested2.nested3.nested4 +// +// len > 5: +// 1. (catalog.schema.table.column).nested[.nestedN]+ +// 2. (schema.table.column).nested1.nested2[.nestedN]+ +// 3. (table.column).nested1.nested2.nested3[.nestedN]+ +// 4. (column).nested1.nested2.nested3.nested4[.nestedN]+ +fn generate_schema_search_terms( + ids: &[String], +) -> impl Iterator<Item = (Option<TableReference>, &String, &[String])> { + // take at most 4 identifiers to form a Column to search with + // - 1 for the column name + // - 0 to 3 for the TableReference + let bound = ids.len().min(4); + // search terms from most specific to least specific + (0..bound).rev().map(|i| { + let nested_names_index = i + 1; + let qualifier_and_column = &ids[0..nested_names_index]; + let (relation, column_name) = form_identifier(qualifier_and_column).unwrap(); + (relation, column_name, &ids[nested_names_index..]) + }) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + // testing according to documentation of generate_schema_search_terms function, + // ensuring generated search terms are in correct order with correct values + fn test_generate_schema_search_terms() -> Result<()> { + type ExpectedItem = ( + Option<TableReference<'static>>, + &'static str, + &'static [&'static str], + ); + fn assert_vec_eq( + expected: Vec<ExpectedItem>, + actual: Vec<(Option<TableReference>, &String, &[String])>, + ) { + for (expected, actual) in expected.into_iter().zip(actual) { + assert_eq!(expected.0, actual.0, "qualifier"); + assert_eq!(expected.1, actual.1, "column name"); + assert_eq!(expected.2, actual.2, "nested names"); + } + } + + let actual = generate_schema_search_terms(&[]).collect::<Vec<_>>(); + assert!(actual.is_empty()); + + let ids = vec!["a".to_string()]; + let actual = generate_schema_search_terms(&ids).collect::<Vec<_>>(); + let expected: Vec<ExpectedItem> = vec![(None, "a", &[])]; + assert_vec_eq(expected, actual); + + let ids = vec!["a".to_string(), "b".to_string()]; + let actual = generate_schema_search_terms(&ids).collect::<Vec<_>>(); + let expected: Vec<ExpectedItem> = vec![ + (Some(TableReference::bare("a")), "b", &[]), + (None, "a", &["b"]), + ]; + assert_vec_eq(expected, actual); + + let ids = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + let actual = generate_schema_search_terms(&ids).collect::<Vec<_>>(); + let expected: Vec<ExpectedItem> = vec![ + (Some(TableReference::partial("a", "b")), "c", &[]), + (Some(TableReference::bare("a")), "b", &["c"]), + (None, "a", &["b", "c"]), + ]; + assert_vec_eq(expected, actual); + + let ids = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + ]; + let actual = generate_schema_search_terms(&ids).collect::<Vec<_>>(); + let expected: Vec<ExpectedItem> = vec![ + (Some(TableReference::full("a", "b", "c")), "d", &[]), + (Some(TableReference::partial("a", "b")), "c", &["d"]), + (Some(TableReference::bare("a")), "b", &["c", "d"]), + (None, "a", &["b", "c", "d"]), + ]; + assert_vec_eq(expected, actual); + + let ids = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + "e".to_string(), + ]; + let actual = generate_schema_search_terms(&ids).collect::<Vec<_>>(); + let expected: Vec<ExpectedItem> = vec![
(Some(TableReference::full("a", "b", "c")), "d", &["e"]), + (Some(TableReference::partial("a", "b")), "c", &["d", "e"]), + (Some(TableReference::bare("a")), "b", &["c", "d", "e"]), + (None, "a", &["b", "c", "d", "e"]), + ]; + assert_vec_eq(expected, actual); + + let ids = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + "e".to_string(), + "f".to_string(), + ]; + let actual = generate_schema_search_terms(&ids).collect::>(); + let expected: Vec = vec![ + (Some(TableReference::full("a", "b", "c")), "d", &["e", "f"]), + ( + Some(TableReference::partial("a", "b")), + "c", + &["d", "e", "f"], + ), + (Some(TableReference::bare("a")), "b", &["c", "d", "e", "f"]), + (None, "a", &["b", "c", "d", "e", "f"]), + ]; + assert_vec_eq(expected, actual); + + Ok(()) + } + + #[test] + fn test_form_identifier() -> Result<()> { + let err = form_identifier(&[]).expect_err("empty identifiers didn't fail"); + let expected = "Internal error: Incorrect number of identifiers: 0. \ + This was likely caused by a bug in DataFusion's code and we would \ + welcome that you file an bug report in our issue tracker"; + assert_eq!(err.to_string(), expected); + + let ids = vec!["a".to_string()]; + let (qualifier, column) = form_identifier(&ids)?; + assert_eq!(qualifier, None); + assert_eq!(column, "a"); + + let ids = vec!["a".to_string(), "b".to_string()]; + let (qualifier, column) = form_identifier(&ids)?; + assert_eq!(qualifier, Some(TableReference::bare("a"))); + assert_eq!(column, "b"); + + let ids = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + let (qualifier, column) = form_identifier(&ids)?; + assert_eq!(qualifier, Some(TableReference::partial("a", "b"))); + assert_eq!(column, "c"); + + let ids = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + ]; + let (qualifier, column) = form_identifier(&ids)?; + assert_eq!(qualifier, Some(TableReference::full("a", "b", "c"))); + assert_eq!(column, "d"); + + let err = form_identifier(&[ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + "e".to_string(), + ]) + .expect_err("too many identifiers didn't fail"); + let expected = "Internal error: Incorrect number of identifiers: 5. 
\ + This was likely caused by a bug in DataFusion's code and we would \ + welcome that you file an bug report in our issue tracker"; + assert_eq!(err.to_string(), expected); + + Ok(()) + } +} From 91a88e75f452a70373c6f56bffd15a5497d8c749 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 3 Mar 2023 22:34:19 +1100 Subject: [PATCH 05/13] Introduce DFField::new_unqualified --- datafusion/common/src/dfschema.rs | 10 +++++++++- datafusion/core/src/physical_plan/planner.rs | 2 +- datafusion/expr/src/expr_schema.rs | 3 +-- datafusion/expr/src/logical_plan/builder.rs | 6 ++---- .../optimizer/src/common_subexpr_eliminate.rs | 6 +++--- .../simplify_expressions/expr_simplifier.rs | 8 ++++---- datafusion/optimizer/src/type_coercion.rs | 17 ++++++++--------- .../optimizer/src/unwrap_cast_in_comparison.rs | 18 ++++++++---------- 8 files changed, 36 insertions(+), 34 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 61b343b63f17..8f4cde72fb3a 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -616,6 +616,14 @@ impl DFField { } } + /// Convenience method for creating new `DFField` without a qualifier + pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { + DFField { + qualifier: None, + field: Field::new(name, data_type, nullable), + } + } + /// Create an unqualified field from an existing Arrow field pub fn from(field: Field) -> Self { Self { @@ -1117,7 +1125,7 @@ mod tests { let arrow_schema_ref = Arc::new(arrow_schema.clone()); let df_schema = DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "c0", DataType::Int64, true)], + vec![DFField::new_unqualified("c0", DataType::Int64, true)], metadata, ) .unwrap(); diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 86b1677f4796..9dc5e38110ce 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -2382,7 +2382,7 @@ Internal error: Optimizer rule 'type_coercion' failed due to unexpected error: E Self { schema: DFSchemaRef::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Int32, false)], + vec![DFField::new_unqualified("a", DataType::Int32, false)], HashMap::new(), ) .unwrap(), diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c778418b42c6..5ceaf1668ada 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -252,8 +252,7 @@ impl ExprSchemable for Expr { self.get_type(input_schema)?, self.nullable(input_schema)?, )), - _ => Ok(DFField::new::<&str>( - None, + _ => Ok(DFField::new_unqualified( &self.display_name()?, self.get_type(input_schema)?, self.nullable(input_schema)?, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index e802d4a8519a..fbbe9320512a 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -178,8 +178,7 @@ impl LogicalPlanBuilder { .map(|(j, data_type)| { // naming is following convention https://www.postgresql.org/docs/current/queries-values.html let name = &format!("column{}", j + 1); - DFField::new::<&str>( - None, + DFField::new_unqualified( name, data_type.clone().unwrap_or(DataType::Utf8), true, @@ -1116,8 +1115,7 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result( - None, + Ok(DFField::new_unqualified( 
left_field.name(), data_type, nullable, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index be8d4801b58c..33bf676db128 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -312,7 +312,7 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` - let field = DFField::new::<&str>(None, &id, data_type.clone(), true); + let field = DFField::new_unqualified(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); } @@ -624,8 +624,8 @@ mod test { let schema = Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new::<&str>(None, "a", DataType::Int64, false), - DFField::new::<&str>(None, "c", DataType::Int64, false), + DFField::new_unqualified("a", DataType::Int64, false), + DFField::new_unqualified("c", DataType::Int64, false), ], Default::default(), )?); diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 2a798fe71794..e7870c3b2dc9 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1758,10 +1758,10 @@ mod tests { Arc::new( DFSchema::new_with_metadata( vec![ - DFField::new::<&str>(None, "c1", DataType::Utf8, true), - DFField::new::<&str>(None, "c2", DataType::Boolean, true), - DFField::new::<&str>(None, "c1_non_null", DataType::Utf8, false), - DFField::new::<&str>(None, "c2_non_null", DataType::Boolean, false), + DFField::new_unqualified("c1", DataType::Utf8, true), + DFField::new_unqualified("c2", DataType::Boolean, true), + DFField::new_unqualified("c1_non_null", DataType::Utf8, false), + DFField::new_unqualified("c2_non_null", DataType::Boolean, false), ], HashMap::new(), ) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 69a11358b8e9..7cfd6cc8d75c 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -664,7 +664,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Float64, true)], + vec![DFField::new_unqualified("a", DataType::Float64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -682,7 +682,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Float64, true)], + vec![DFField::new_unqualified("a", DataType::Float64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -881,7 +881,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + vec![DFField::new_unqualified("a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -899,8 +899,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>( - None, + vec![DFField::new_unqualified( "a", DataType::Decimal128(12, 4), true, @@ -1087,7 +1086,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", data_type, true)], + vec![DFField::new_unqualified("a", data_type, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -1100,7 +1099,7 @@ mod test { 
// gt let schema = Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + vec![DFField::new_unqualified("a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -1114,7 +1113,7 @@ mod test { // eq let schema = Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + vec![DFField::new_unqualified("a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), @@ -1128,7 +1127,7 @@ mod test { // lt let schema = Arc::new( DFSchema::new_with_metadata( - vec![DFField::new::<&str>(None, "a", DataType::Int64, true)], + vec![DFField::new_unqualified("a", DataType::Int64, true)], std::collections::HashMap::new(), ) .unwrap(), diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 94a6d6404dde..f5c56e5b2c7f 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -695,20 +695,18 @@ mod tests { Arc::new( DFSchema::new_with_metadata( vec![ - DFField::new::<&str>(None, "c1", DataType::Int32, false), - DFField::new::<&str>(None, "c2", DataType::Int64, false), - DFField::new::<&str>(None, "c3", DataType::Decimal128(18, 2), false), - DFField::new::<&str>(None, "c4", DataType::Decimal128(38, 37), false), - DFField::new::<&str>(None, "c5", DataType::Float32, false), - DFField::new::<&str>(None, "c6", DataType::UInt32, false), - DFField::new::<&str>( - None, + DFField::new_unqualified("c1", DataType::Int32, false), + DFField::new_unqualified("c2", DataType::Int64, false), + DFField::new_unqualified("c3", DataType::Decimal128(18, 2), false), + DFField::new_unqualified("c4", DataType::Decimal128(38, 37), false), + DFField::new_unqualified("c5", DataType::Float32, false), + DFField::new_unqualified("c6", DataType::UInt32, false), + DFField::new_unqualified( "ts_nano_none", timestamp_nano_none_type(), false, ), - DFField::new::<&str>( - None, + DFField::new_unqualified( "ts_nano_utf", timestamp_nano_utc_type(), false, From 587b0b26346bfa16f9a21af2033e28bd2594d183 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 3 Mar 2023 22:45:22 +1100 Subject: [PATCH 06/13] Introduce new_unqualified methods for simpler syntax --- datafusion/common/src/column.rs | 8 ++++++++ datafusion/common/src/dfschema.rs | 6 +++--- datafusion/common/src/error.rs | 12 ++++++++++++ datafusion/common/src/lib.rs | 5 ++++- datafusion/sql/src/planner.rs | 5 +++-- 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 39242eea9ee0..94b3dd5785a7 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -51,6 +51,14 @@ impl Column { } } + /// Convenience method for when there is no qualifier + pub fn new_unqualified(name: impl Into) -> Self { + Self { + relation: None, + name: name.into(), + } + } + /// Create Column from unqualified name. 
pub fn from_name(name: impl Into) -> Self { Self { diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 8f4cde72fb3a..e3b156a5cc9c 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -22,7 +22,7 @@ use std::collections::{HashMap, HashSet}; use std::convert::TryFrom; use std::sync::Arc; -use crate::error::{DataFusionError, Result, SchemaError}; +use crate::error::{unqualified_field_not_found, DataFusionError, Result, SchemaError}; use crate::utils::quote_identifier; use crate::{field_not_found, Column, OwnedTableReference, TableReference}; @@ -185,7 +185,7 @@ impl DFSchema { } } - Err(field_not_found::<&str>(None, name, self)) + Err(unqualified_field_not_found(name, self)) } pub fn index_of_column_by_name( @@ -284,7 +284,7 @@ impl DFSchema { pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> { let matches = self.fields_with_unqualified_name(name); match matches.len() { - 0 => Err(field_not_found::<&str>(None, name, self)), + 0 => Err(unqualified_field_not_found(name, self)), 1 => Ok(matches[0]), _ => { // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 88b2b3c49bc2..3428587be6f3 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -152,6 +152,18 @@ pub fn field_not_found>( }) } +/// Convenience wrapper over [`field_not_found`] for when there is no qualifier +pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionError { + DataFusionError::SchemaError(SchemaError::FieldNotFound { + field: Box::new(Column::new_unqualified(name)), + valid_fields: schema + .fields() + .iter() + .map(|f| f.qualified_column()) + .collect(), + }) +} + impl Display for SchemaError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 636feb21a489..4af8720b009a 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -34,7 +34,10 @@ pub mod utils; use arrow::compute::SortOptions; pub use column::Column; pub use dfschema::{DFField, DFSchema, DFSchemaRef, ExprSchema, ToDFSchema}; -pub use error::{field_not_found, DataFusionError, Result, SchemaError, SharedResult}; +pub use error::{ + field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, + SharedResult, +}; pub use parsers::parse_interval; pub use scalar::{ScalarType, ScalarValue}; pub use stats::{ColumnStatistics, Statistics}; diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 4c8dd4fcd869..601ec4fa6f1a 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -21,13 +21,14 @@ use std::sync::Arc; use std::vec; use arrow_schema::*; +use datafusion_common::field_not_found; use sqlparser::ast::ExactNumberInfo; use sqlparser::ast::TimezoneInfo; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; use datafusion_common::config::ConfigOptions; -use datafusion_common::{field_not_found, DFSchema, DataFusionError, Result}; +use datafusion_common::{unqualified_field_not_found, DFSchema, DataFusionError, Result}; use datafusion_common::{OwnedTableReference, TableReference}; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; @@ -204,7 +205,7 @@ impl<'a, S: 
ContextProvider> SqlToRel<'a, S> { if !schema.fields_with_unqualified_name(&col.name).is_empty() { Ok(()) } else { - Err(field_not_found::<&str>(None, col.name.as_str(), schema)) + Err(unqualified_field_not_found(col.name.as_str(), schema)) } } } From f730de71af6919ae39201f27f17f001adf6c1c00 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 3 Mar 2023 22:49:02 +1100 Subject: [PATCH 07/13] Fix merge --- datafusion/common/src/dfschema.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index c8c1474d00ef..36a0f3b217b0 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -343,7 +343,11 @@ impl DFSchema { } /// Find if the field exists with the given qualified name - pub fn has_column_with_qualified_name(&self, qualifier: &str, name: &str) -> bool { + pub fn has_column_with_qualified_name( + &self, + qualifier: &TableReference, + name: &str, + ) -> bool { self.fields().iter().any(|field| { field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false) && field.name() == name @@ -353,7 +357,9 @@ impl DFSchema { /// Find if the field exists with the given qualified column pub fn has_column(&self, column: &Column) -> bool { match &column.relation { - Some(r) => self.has_column_with_qualified_name(r, &column.name), + Some(r) => { + self.has_column_with_qualified_name(&r.as_table_reference(), &column.name) + } None => self.has_column_with_unqualified_name(&column.name), } } From db57180a83b55c5c426f4e32440ead0079f5a7b1 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:33:51 +1100 Subject: [PATCH 08/13] New ident() expr function --- datafusion/expr/src/expr_fn.rs | 8 +++++++- docs/source/user-guide/example-usage.md | 9 ++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 6465ca80b867..ba351b463599 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -39,6 +39,12 @@ pub fn col(ident: impl Into) -> Expr { Expr::Column(ident.into()) } +/// Create an unqualified column expression from the provided name, without normalizing +/// the column +pub fn ident(name: impl Into) -> Expr { + Expr::Column(Column::from_name(name)) +} + /// Return a new expression `left right` pub fn binary_expr(left: Expr, op: Operator, right: Expr) -> Expr { Expr::BinaryExpr(BinaryExpr::new(Box::new(left), op, Box::new(right))) @@ -652,7 +658,7 @@ mod test { #[test] fn filter_is_null_and_is_not_null() { let col_null = col("col1"); - let col_not_null = col("col2"); + let col_not_null = ident("col2"); assert_eq!(format!("{:?}", col_null.is_null()), "col1 IS NULL"); assert_eq!( format!("{:?}", col_not_null.is_not_null()), diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md index d7782b511a1d..a2cd109a61ef 100644 --- a/docs/source/user-guide/example-usage.md +++ b/docs/source/user-guide/example-usage.md @@ -118,9 +118,12 @@ async fn main() -> datafusion::error::Result<()> { let ctx = SessionContext::new(); let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; - let df = df.filter(col("\"A\"").lt_eq(col("c")))? - .aggregate(vec![col("\"A\"")], vec![min(col("b"))])? 
-           .limit(0, Some(100))?;
+    let df = df
+        // col will parse the input string, hence requiring double quotes to maintain the capitalization
+        .filter(col("\"A\"").lt_eq(col("c")))?
+        // alternatively use ident to pass in an unqualified column name directly without parsing
+        .aggregate(vec![ident("A")], vec![min(col("b"))])?
+        .limit(0, Some(100))?;
 
     // execute and print results
     df.show().await?;

From 5240976539d601fa6cabb6a466fbda229bd085bb Mon Sep 17 00:00:00 2001
From: Jefffrey <22608443+Jefffrey@users.noreply.github.com>
Date: Wed, 8 Mar 2023 22:35:25 +1100
Subject: [PATCH 09/13] Refactor OwnedTableReference to be a type alias of
 TableReference<'static>

---
 datafusion/common/src/column.rs               |  12 +-
 datafusion/common/src/dfschema.rs             |  24 +--
 datafusion/common/src/error.rs                |   2 +-
 datafusion/common/src/table_reference.rs      | 163 ++----------------
 datafusion/core/src/catalog/listing_schema.rs |   4 +-
 datafusion/core/src/execution/context.rs      |  30 ++--
 datafusion/core/src/physical_plan/planner.rs  |   2 +-
 .../src/engines/datafusion/create_table.rs    |   6 +-
 .../src/engines/datafusion/insert.rs          |   8 +-
 datafusion/expr/src/expr_rewriter.rs          |   2 +-
 datafusion/expr/src/logical_plan/builder.rs   |  61 +++----
 datafusion/optimizer/src/optimizer.rs         |   2 +-
 .../optimizer/src/push_down_projection.rs     |   4 +-
 .../proto/src/logical_plan/from_proto.rs      |  12 +-
 datafusion/proto/src/logical_plan/to_proto.rs |  14 +-
 datafusion/sql/src/planner.rs                 |  15 +-
 datafusion/sql/src/relation/mod.rs            |   5 +-
 datafusion/sql/src/statement.rs               |  22 +--
 18 files changed, 104 insertions(+), 284 deletions(-)

diff --git a/datafusion/common/src/column.rs
index 94b3dd5785a7..757f1f0727f6 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -76,22 +76,22 @@ impl Column {
             1 => (None, idents.remove(0)),
             2 => (
                 Some(OwnedTableReference::Bare {
-                    table: idents.remove(0),
+                    table: idents.remove(0).into(),
                 }),
                 idents.remove(0),
             ),
             3 => (
                 Some(OwnedTableReference::Partial {
-                    schema: idents.remove(0),
-                    table: idents.remove(0),
+                    schema: idents.remove(0).into(),
+                    table: idents.remove(0).into(),
                 }),
                 idents.remove(0),
             ),
             4 => (
                 Some(OwnedTableReference::Full {
-                    catalog: idents.remove(0),
-                    schema: idents.remove(0),
-                    table: idents.remove(0),
+                    catalog: idents.remove(0).into(),
+                    schema: idents.remove(0).into(),
+                    table: idents.remove(0).into(),
                 }),
                 idents.remove(0),
             ),
diff --git a/datafusion/common/src/dfschema.rs
index c74b50218082..d8e59ab6809a 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -71,7 +71,7 @@ impl DFSchema {
             if !qualified_names.insert((qualifier, field.name())) {
                 return Err(DataFusionError::SchemaError(
                     SchemaError::DuplicateQualifiedField {
-                        qualifier: qualifier.clone(),
+                        qualifier: Box::new(qualifier.clone()),
                         name: field.name().to_string(),
                     },
                 ));
@@ -114,7 +114,7 @@ impl DFSchema {
             schema
                 .fields()
                 .iter()
-                .map(|f| DFField::from_qualified(qualifier, f.clone()))
+                .map(|f| DFField::from_qualified(qualifier.to_string(), f.clone()))
                 .collect(),
             schema.metadata().clone(),
         )
@@ -139,9 +139,7 @@ impl DFSchema {
         for field in other_schema.fields() {
             // skip duplicate columns
             let duplicated_field = match field.qualifier() {
-                Some(q) => self
-                    .field_with_name(Some(&q.as_table_reference()), field.name())
-                    .is_ok(),
+                Some(q) => self.field_with_name(Some(q), field.name()).is_ok(),
                 // for unqualified columns, check as unqualified name
                 None => self.field_with_unqualified_name(field.name()).is_ok(),
             };
@@ -203,7 +201,7 @@ impl DFSchema { // current field is qualified and not shared between relations, compare both // qualifier and name. (Some(q), Some(field_q)) => { - q.resolved_eq(&field_q.as_table_reference()) && field.name() == name + q.resolved_eq(field_q) && field.name() == name } // field to lookup is qualified but current field is unqualified. (Some(qq), None) => { @@ -240,15 +238,13 @@ impl DFSchema { /// Find the index of the column with the given qualifier and name pub fn index_of_column(&self, col: &Column) -> Result { - let tr = col.relation.as_ref().map(|r| r.as_table_reference()); - self.index_of_column_by_name(tr.as_ref(), &col.name)? + self.index_of_column_by_name(col.relation.as_ref(), &col.name)? .ok_or_else(|| field_not_found(col.relation.clone(), &col.name, self)) } /// Check if the column is in the current schema pub fn is_column_from_schema(&self, col: &Column) -> Result { - let tr = col.relation.as_ref().map(|r| r.as_table_reference()); - self.index_of_column_by_name(tr.as_ref(), &col.name) + self.index_of_column_by_name(col.relation.as_ref(), &col.name) .map(|idx| idx.is_some()) } @@ -331,9 +327,7 @@ impl DFSchema { /// Find the field with the given qualified column pub fn field_from_column(&self, column: &Column) -> Result<&DFField> { match &column.relation { - Some(r) => { - self.field_with_qualified_name(&r.as_table_reference(), &column.name) - } + Some(r) => self.field_with_qualified_name(r, &column.name), None => self.field_with_unqualified_name(&column.name), } } @@ -358,9 +352,7 @@ impl DFSchema { /// Find if the field exists with the given qualified column pub fn has_column(&self, column: &Column) -> bool { match &column.relation { - Some(r) => { - self.has_column_with_qualified_name(&r.as_table_reference(), &column.name) - } + Some(r) => self.has_column_with_qualified_name(r, &column.name), None => self.has_column_with_unqualified_name(&column.name), } } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 3428587be6f3..0231895f7742 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -124,7 +124,7 @@ pub enum SchemaError { AmbiguousReference { field: Column }, /// Schema contains duplicate qualified field name DuplicateQualifiedField { - qualifier: OwnedTableReference, + qualifier: Box, name: String, }, /// Schema contains duplicate unqualified field name diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 3ba3ee8f4f15..7fb3dfcc7d94 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -36,7 +36,7 @@ impl<'a> std::fmt::Display for ResolvedTableReference<'a> { } /// Represents a path to a table that may require further resolution -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TableReference<'a> { /// An unqualified table reference, e.g. 
"table" Bare { @@ -61,6 +61,8 @@ pub enum TableReference<'a> { }, } +pub type OwnedTableReference = TableReference<'static>; + impl std::fmt::Display for TableReference<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -195,16 +197,16 @@ impl<'a> TableReference<'a> { schema, table, } => OwnedTableReference::Full { - catalog: catalog.into(), - schema: schema.into(), - table: table.into(), + catalog: catalog.to_string().into(), + schema: schema.to_string().into(), + table: table.to_string().into(), }, Self::Partial { schema, table } => OwnedTableReference::Partial { - schema: schema.into(), - table: table.into(), + schema: schema.to_string().into(), + table: table.to_string().into(), }, Self::Bare { table } => OwnedTableReference::Bare { - table: table.into(), + table: table.to_string().into(), }, } } @@ -258,155 +260,10 @@ impl<'a> TableReference<'a> { } } -/// Represents a path to a table that may require further resolution -/// that owns the underlying names -#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] -pub enum OwnedTableReference { - /// An unqualified table reference, e.g. "table" - Bare { - /// The table name - table: String, - }, - /// A partially resolved table reference, e.g. "schema.table" - Partial { - /// The schema containing the table - schema: String, - /// The table name - table: String, - }, - /// A fully resolved table reference, e.g. "catalog.schema.table" - Full { - /// The catalog (aka database) containing the table - catalog: String, - /// The schema containing the table - schema: String, - /// The table name - table: String, - }, -} - -impl std::fmt::Display for OwnedTableReference { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - OwnedTableReference::Bare { table } => write!(f, "{table}"), - OwnedTableReference::Partial { schema, table } => { - write!(f, "{schema}.{table}") - } - OwnedTableReference::Full { - catalog, - schema, - table, - } => write!(f, "{catalog}.{schema}.{table}"), - } - } -} - -impl OwnedTableReference { - /// Return a `TableReference` view of this `OwnedTableReference` - pub fn as_table_reference(&self) -> TableReference<'_> { - match self { - Self::Bare { table } => TableReference::Bare { - table: table.into(), - }, - Self::Partial { schema, table } => TableReference::Partial { - schema: schema.into(), - table: table.into(), - }, - Self::Full { - catalog, - schema, - table, - } => TableReference::Full { - catalog: catalog.into(), - schema: schema.into(), - table: table.into(), - }, - } - } - - /// Retrieve the actual table name, regardless of qualification - pub fn table(&self) -> &str { - match self { - Self::Full { table, .. } - | Self::Partial { table, .. 
} - | Self::Bare { table } => table, - } - } - - /// Forms a string where the identifiers are quoted - pub fn to_quoted_string(&self) -> String { - match self { - OwnedTableReference::Bare { table } => quote_identifier(table), - OwnedTableReference::Partial { schema, table } => { - format!("{}.{}", quote_identifier(schema), quote_identifier(table)) - } - OwnedTableReference::Full { - catalog, - schema, - table, - } => format!( - "{}.{}.{}", - quote_identifier(catalog), - quote_identifier(schema), - quote_identifier(table) - ), - } - } -} - -impl PartialEq> for OwnedTableReference { - fn eq(&self, other: &TableReference<'_>) -> bool { - self.as_table_reference().eq(other) - } -} - -impl PartialEq for TableReference<'_> { - fn eq(&self, other: &OwnedTableReference) -> bool { - self.eq(&other.as_table_reference()) - } -} - -/// Parse a `&str` into a OwnedTableReference -impl From<&str> for OwnedTableReference { - fn from(s: &str) -> Self { - let table_reference: TableReference = s.into(); - table_reference.to_owned_reference() - } -} - /// Parse a `String` into a OwnedTableReference impl From for OwnedTableReference { fn from(s: String) -> Self { - Self::from(s.as_str()) - } -} - -/// Parse a `&String` into a OwnedTableReference -impl From<&String> for OwnedTableReference { - fn from(s: &String) -> Self { - Self::from(s.as_str()) - } -} - -/// Parse a `&String` into a OwnedTableReference -impl From<&OwnedTableReference> for OwnedTableReference { - fn from(s: &OwnedTableReference) -> Self { - s.clone() - } -} - -/// Parse a `TableReference` into a OwnedTableReference -impl From<&'_ TableReference<'_>> for OwnedTableReference { - fn from(s: &'_ TableReference) -> Self { - s.to_owned().to_owned_reference() - } -} - -/// Convert `OwnedTableReference` into a `TableReference`. 
Somewhat -/// awkward to use but 'idiomatic': `(&table_ref).into()` -impl<'a> From<&'a OwnedTableReference> for TableReference<'a> { - fn from(r: &'a OwnedTableReference) -> Self { - r.as_table_reference() + TableReference::parse_str(&s).to_owned_reference() } } diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs index 32ee9f62ac3d..3ea7a1098f9f 100644 --- a/datafusion/core/src/catalog/listing_schema.rs +++ b/datafusion/core/src/catalog/listing_schema.rs @@ -128,9 +128,7 @@ impl ListingSchemaProvider { if !self.table_exist(table_name) { let table_url = format!("{}/{}", self.authority, table_path); - let name = OwnedTableReference::Bare { - table: table_name.to_string(), - }; + let name = OwnedTableReference::bare(table_name.to_string()); let provider = self .factory .create( diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 0340b4761bc7..f72ea7560f86 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -329,19 +329,19 @@ impl SessionContext { or_replace, }) => { let input = Arc::try_unwrap(input).unwrap_or_else(|e| e.as_ref().clone()); - let table = self.table(&name).await; + let table = self.table(name.clone()).await; match (if_not_exists, or_replace, table) { (true, false, Ok(_)) => self.return_empty_dataframe(), (false, true, Ok(_)) => { - self.deregister_table(&name)?; + self.deregister_table(name.clone())?; let schema = Arc::new(input.schema().as_ref().into()); let physical = DataFrame::new(self.state(), input); let batches: Vec<_> = physical.collect_partitioned().await?; let table = Arc::new(MemTable::try_new(schema, batches)?); - self.register_table(&name, table)?; + self.register_table(name.clone(), table)?; self.return_empty_dataframe() } (true, true, Ok(_)) => Err(DataFusionError::Execution( @@ -354,7 +354,7 @@ impl SessionContext { let batches: Vec<_> = physical.collect_partitioned().await?; let table = Arc::new(MemTable::try_new(schema, batches)?); - self.register_table(&name, table)?; + self.register_table(name, table)?; self.return_empty_dataframe() } (false, false, Ok(_)) => Err(DataFusionError::Execution(format!( @@ -369,22 +369,22 @@ impl SessionContext { or_replace, definition, }) => { - let view = self.table(&name).await; + let view = self.table(name.clone()).await; match (or_replace, view) { (true, Ok(_)) => { - self.deregister_table(&name)?; + self.deregister_table(name.clone())?; let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - self.register_table(&name, table)?; + self.register_table(name.clone(), table)?; self.return_empty_dataframe() } (_, Err(_)) => { let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - self.register_table(&name, table)?; + self.register_table(name, table)?; self.return_empty_dataframe() } (false, Ok(_)) => Err(DataFusionError::Execution(format!( @@ -396,7 +396,9 @@ impl SessionContext { LogicalPlan::DropTable(DropTable { name, if_exists, .. }) => { - let result = self.find_and_deregister(&name, TableType::Base).await; + let result = self + .find_and_deregister(name.clone(), TableType::Base) + .await; match (result, if_exists) { (Ok(true), _) => self.return_empty_dataframe(), (_, true) => self.return_empty_dataframe(), @@ -409,7 +411,9 @@ impl SessionContext { LogicalPlan::DropView(DropView { name, if_exists, .. 
}) => { - let result = self.find_and_deregister(&name, TableType::View).await; + let result = self + .find_and_deregister(name.clone(), TableType::View) + .await; match (result, if_exists) { (Ok(true), _) => self.return_empty_dataframe(), (_, true) => self.return_empty_dataframe(), @@ -567,7 +571,7 @@ impl SessionContext { &self, cmd: &CreateExternalTable, ) -> Result { - let exist = self.table_exist(&cmd.name)?; + let exist = self.table_exist(cmd.name.clone())?; if exist { match cmd.if_not_exists { true => return self.return_empty_dataframe(), @@ -582,7 +586,7 @@ impl SessionContext { let table_provider: Arc = self.create_custom_table(cmd).await?; - self.register_table(&cmd.name, table_provider)?; + self.register_table(cmd.name.clone(), table_provider)?; self.return_empty_dataframe() } @@ -1914,7 +1918,7 @@ impl SessionState { self.config.options.sql_parser.parse_float_as_decimal; for reference in references { let table = reference.table(); - let resolved = self.resolve_table_ref(reference.as_table_reference()); + let resolved = self.resolve_table_ref(reference.clone()); if let Entry::Vacant(v) = provider.tables.entry(resolved.to_string()) { if let Ok(schema) = self.schema_for_ref(resolved) { if let Some(table) = schema.table(table).await { diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 50e25da76a72..1071e23fd3a0 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -2539,7 +2539,7 @@ Internal error: Optimizer rule 'type_coercion' failed due to unexpected error: E .projected_schema .as_ref() .clone() - .replace_qualifier(name); + .replace_qualifier(name.to_string()); scan.projected_schema = Arc::new(new_schema); LogicalPlan::TableScan(scan) } diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs index 981dd75b56d7..a753cb79953c 100644 --- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs +++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs @@ -36,11 +36,11 @@ pub async fn create_table( ) -> Result { let table_reference = object_name_to_table_reference(name, ctx.enable_ident_normalization())?; - let existing_table = ctx.table(&table_reference).await; + let existing_table = ctx.table(table_reference.clone()).await; match (if_not_exists, or_replace, existing_table) { (true, false, Ok(_)) => Ok(DBOutput::StatementComplete(0)), (false, true, Ok(_)) => { - ctx.deregister_table(&table_reference)?; + ctx.deregister_table(table_reference.clone())?; create_new_table(ctx, table_reference, columns) } (true, true, Ok(_)) => { @@ -78,6 +78,6 @@ fn create_new_table( ); let schema = Arc::new(sql_to_rel.build_schema(columns)?); let table_provider = Arc::new(MemTable::try_new(schema, vec![])?); - ctx.register_table(&table_reference, table_provider)?; + ctx.register_table(table_reference.clone(), table_provider)?; Ok(DBOutput::StatementComplete(0)) } diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs index a8fca3b16c06..e518db2204f7 100644 --- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs +++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs @@ -56,9 +56,9 @@ pub async fn insert(ctx: &SessionContext, insert_stmt: SQLStatement) -> Result Result DFField { - 
DFField::new(Some(relation), column, DataType::Int8, false) + DFField::new(Some(relation.to_string()), column, DataType::Int8, false) } #[test] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 1c12ca6c1b31..ee5e5d804120 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -225,7 +225,10 @@ impl LogicalPlanBuilder { DFSchema::new_with_metadata( p.iter() .map(|i| { - DFField::from_qualified(&table_name, schema.field(*i).clone()) + DFField::from_qualified( + table_name.to_string(), + schema.field(*i).clone(), + ) }) .collect(), schema.metadata().clone(), @@ -608,22 +611,14 @@ impl LogicalPlanBuilder { match (&l.relation, &r.relation) { (Some(lr), Some(rr)) => { - let l_is_left = self.plan.schema().field_with_qualified_name( - &lr.as_table_reference(), - &l.name, - ); - let l_is_right = right.schema().field_with_qualified_name( - &lr.as_table_reference(), - &l.name, - ); - let r_is_left = self.plan.schema().field_with_qualified_name( - &rr.as_table_reference(), - &r.name, - ); - let r_is_right = right.schema().field_with_qualified_name( - &rr.as_table_reference(), - &r.name, - ); + let l_is_left = + self.plan.schema().field_with_qualified_name(lr, &l.name); + let l_is_right = + right.schema().field_with_qualified_name(lr, &l.name); + let r_is_left = + self.plan.schema().field_with_qualified_name(rr, &r.name); + let r_is_right = + right.schema().field_with_qualified_name(rr, &r.name); match (l_is_left, l_is_right, r_is_left, r_is_right) { (_, Ok(_), Ok(_), _) => (Ok(r), Ok(l)), @@ -635,14 +630,10 @@ impl LogicalPlanBuilder { } } (Some(lr), None) => { - let l_is_left = self.plan.schema().field_with_qualified_name( - &lr.as_table_reference(), - &l.name, - ); - let l_is_right = right.schema().field_with_qualified_name( - &lr.as_table_reference(), - &l.name, - ); + let l_is_left = + self.plan.schema().field_with_qualified_name(lr, &l.name); + let l_is_right = + right.schema().field_with_qualified_name(lr, &l.name); match (l_is_left, l_is_right) { (Ok(_), _) => (Ok(l), Self::normalize(&right, r)), @@ -654,14 +645,10 @@ impl LogicalPlanBuilder { } } (None, Some(rr)) => { - let r_is_left = self.plan.schema().field_with_qualified_name( - &rr.as_table_reference(), - &r.name, - ); - let r_is_right = right.schema().field_with_qualified_name( - &rr.as_table_reference(), - &r.name, - ); + let r_is_left = + self.plan.schema().field_with_qualified_name(rr, &r.name); + let r_is_right = + right.schema().field_with_qualified_name(rr, &r.name); match (r_is_left, r_is_right) { (Ok(_), _) => (Ok(r), Self::normalize(&right, l)), @@ -1113,7 +1100,7 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result Result { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => DFField::new( - unnest_field.qualifier(), + unnest_field.qualifier().cloned(), unnest_field.name(), field.data_type().clone(), unnest_field.is_nullable(), @@ -1621,7 +1608,7 @@ mod tests { name, }, })) => { - assert_eq!("employee_csv", table.as_str()); + assert_eq!("employee_csv", table); assert_eq!("id", &name); Ok(()) } @@ -1650,7 +1637,7 @@ mod tests { name, }, })) => { - assert_eq!("employee_csv", table.as_str()); + assert_eq!("employee_csv", table); assert_eq!("state", &name); Ok(()) } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 3a1974d48eaa..6091c4f6f477 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ 
b/datafusion/optimizer/src/optimizer.rs @@ -521,7 +521,7 @@ mod tests { let new_arrow_field = f.field().clone().with_metadata(metadata); if let Some(qualifier) = f.qualifier() { - DFField::from_qualified(qualifier, new_arrow_field) + DFField::from_qualified(qualifier.clone(), new_arrow_field) } else { DFField::from(new_arrow_field) } diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 1abeca4829e8..0496669ae5c9 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -538,7 +538,9 @@ fn push_down_scan( // create the projected schema let projected_fields: Vec = projection .iter() - .map(|i| DFField::from_qualified(&scan.table_name, schema.fields()[*i].clone())) + .map(|i| { + DFField::from_qualified(scan.table_name.clone(), schema.fields()[*i].clone()) + }) .collect(); let projected_schema = projected_fields.to_dfschema_ref()?; diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 250aa34435e2..aa416e63b8a6 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -187,7 +187,7 @@ impl TryFrom<&protobuf::DfField> for DFField { let field = df_field.field.as_ref().required("field")?; Ok(match &df_field.qualifier { - Some(q) => DFField::from_qualified(&q.relation, field), + Some(q) => DFField::from_qualified(q.relation.clone(), field), None => DFField::from(field), }) } @@ -214,21 +214,17 @@ impl TryFrom for OwnedTableReference { match table_reference_enum { TableReferenceEnum::Bare(protobuf::BareTableReference { table }) => { - Ok(OwnedTableReference::Bare { table }) + Ok(OwnedTableReference::bare(table)) } TableReferenceEnum::Partial(protobuf::PartialTableReference { schema, table, - }) => Ok(OwnedTableReference::Partial { schema, table }), + }) => Ok(OwnedTableReference::partial(schema, table)), TableReferenceEnum::Full(protobuf::FullTableReference { catalog, schema, table, - }) => Ok(OwnedTableReference::Full { - catalog, - schema, - table, - }), + }) => Ok(OwnedTableReference::full(catalog, schema, table)), } } } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index d228274e3958..a794c9bd0605 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1321,12 +1321,14 @@ impl From for protobuf::OwnedTableReference { use protobuf::owned_table_reference::TableReferenceEnum; let table_reference_enum = match t { OwnedTableReference::Bare { table } => { - TableReferenceEnum::Bare(protobuf::BareTableReference { table }) + TableReferenceEnum::Bare(protobuf::BareTableReference { + table: table.to_string(), + }) } OwnedTableReference::Partial { schema, table } => { TableReferenceEnum::Partial(protobuf::PartialTableReference { - schema, - table, + schema: schema.to_string(), + table: table.to_string(), }) } OwnedTableReference::Full { @@ -1334,9 +1336,9 @@ impl From for protobuf::OwnedTableReference { schema, table, } => TableReferenceEnum::Full(protobuf::FullTableReference { - catalog, - schema, - table, + catalog: catalog.to_string(), + schema: schema.to_string(), + table: table.to_string(), }), }; diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 601ec4fa6f1a..d9a936abc851 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -195,10 +195,7 @@ impl<'a, S: ContextProvider> 
SqlToRel<'a, S> { .try_for_each(|col| match col { Expr::Column(col) => match &col.relation { Some(r) => { - schema.field_with_qualified_name( - &r.as_table_reference(), - &col.name, - )?; + schema.field_with_qualified_name(r, &col.name)?; Ok(()) } None => { @@ -374,22 +371,18 @@ pub(crate) fn idents_to_table_reference( match taker.0.len() { 1 => { let table = taker.take(enable_normalization); - Ok(OwnedTableReference::Bare { table }) + Ok(OwnedTableReference::bare(table)) } 2 => { let table = taker.take(enable_normalization); let schema = taker.take(enable_normalization); - Ok(OwnedTableReference::Partial { schema, table }) + Ok(OwnedTableReference::partial(schema, table)) } 3 => { let table = taker.take(enable_normalization); let schema = taker.take(enable_normalization); let catalog = taker.take(enable_normalization); - Ok(OwnedTableReference::Full { - catalog, - schema, - table, - }) + Ok(OwnedTableReference::full(catalog, schema, table)) } _ => Err(DataFusionError::Plan(format!( "Unsupported compound identifier '{:?}'", diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 12f5bd6b49fc..19a278666efb 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -35,10 +35,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let table_name = table_ref.to_string(); let cte = planner_context.ctes.get(&table_name); ( - match ( - cte, - self.schema_provider.get_table_provider((&table_ref).into()), - ) { + match (cte, self.schema_provider.get_table_provider(table_ref)) { (Some(cte_plan), _) => Ok(cte_plan.clone()), (_, Ok(provider)) => { LogicalPlanBuilder::scan(&table_name, provider, None)?.build() diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 201cf7a85243..561899491700 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -413,9 +413,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let DescribeTableStmt { table_name } = statement; let table_ref = self.object_name_to_table_reference(table_name)?; - let table_source = self - .schema_provider - .get_table_provider((&table_ref).into())?; + let table_source = self.schema_provider.get_table_provider(table_ref)?; let schema = table_source.schema(); @@ -463,7 +461,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let schema = self.build_schema(columns)?; // External tables do not support schemas at the moment, so the name is just a table name - let name = OwnedTableReference::Bare { table: name }; + let name = OwnedTableReference::bare(name); Ok(LogicalPlan::CreateExternalTable(PlanCreateExternalTable { schema: schema.to_dfschema_ref()?, @@ -631,9 +629,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Do a table lookup to verify the table exists let table_ref = self.object_name_to_table_reference(table_name.clone())?; - let provider = self - .schema_provider - .get_table_provider((&table_ref).into())?; + let provider = self.schema_provider.get_table_provider(table_ref.clone())?; let schema = (*provider.schema()).clone(); let schema = DFSchema::try_from(schema)?; let scan = @@ -685,7 +681,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let table_name = self.object_name_to_table_reference(table_name)?; let provider = self .schema_provider - .get_table_provider((&table_name).into())?; + .get_table_provider(table_name.clone())?; let arrow_schema = (*provider.schema()).clone(); let table_schema = Arc::new(DFSchema::try_from(arrow_schema)?); let values = table_schema.fields().iter().map(|f| { @@ -787,7 
+783,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let table_name = self.object_name_to_table_reference(table_name)?; let provider = self .schema_provider - .get_table_provider((&table_name).into())?; + .get_table_provider(table_name.clone())?; let arrow_schema = (*provider.schema()).clone(); let table_schema = DFSchema::try_from(arrow_schema)?; @@ -893,9 +889,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Do a table lookup to verify the table exists let table_ref = self.object_name_to_table_reference(sql_table_name)?; - let _ = self - .schema_provider - .get_table_provider((&table_ref).into())?; + let _ = self.schema_provider.get_table_provider(table_ref)?; // treat both FULL and EXTENDED as the same let select_list = if full || extended { @@ -931,9 +925,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Do a table lookup to verify the table exists let table_ref = self.object_name_to_table_reference(sql_table_name)?; - let _ = self - .schema_provider - .get_table_provider((&table_ref).into())?; + let _ = self.schema_provider.get_table_provider(table_ref)?; let query = format!( "SELECT table_catalog, table_schema, table_name, definition FROM information_schema.views WHERE {where_clause}" From 44bb3fa0bd05f420ac5373bba6698069cdf5e31c Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 9 Mar 2023 19:23:43 +1100 Subject: [PATCH 10/13] Update comments --- datafusion/expr/src/logical_plan/plan.rs | 2 ++ datafusion/optimizer/src/push_down_projection.rs | 1 - datafusion/sql/src/expr/identifier.rs | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f5866d5a96d5..330e40e5155d 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1502,6 +1502,8 @@ pub struct Window { #[derive(Clone)] pub struct TableScan { /// The name of the table + // TODO: change to OwnedTableReference + // see: https://github.com/apache/arrow-datafusion/issues/5522 pub table_name: String, /// The source of the table pub source: Arc, diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 0496669ae5c9..4e9d5f039554 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -495,7 +495,6 @@ fn push_down_scan( let schema = scan.source.schema(); let mut projection: BTreeSet = used_columns .iter() - // TODO: change scan.table_name from String? 
.filter(|c| { c.relation.is_none() || c.relation.as_ref().unwrap().to_string() == scan.table_name diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9714ad46ab55..7548fc8bd46b 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -124,6 +124,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))) } else { let s = &ids[0..ids.len()]; + // safe unwrap as s can never be empty or exceed the bounds let (relation, column_name) = form_identifier(s).unwrap(); let relation = relation.map(|r| r.to_owned_reference()); Ok(Expr::Column(Column::new(relation, column_name))) @@ -262,6 +263,7 @@ fn generate_schema_search_terms( (0..bound).rev().map(|i| { let nested_names_index = i + 1; let qualifier_and_column = &ids[0..nested_names_index]; + // safe unwrap as qualifier_and_column can never be empty or exceed the bounds let (relation, column_name) = form_identifier(qualifier_and_column).unwrap(); (relation, column_name, &ids[nested_names_index..]) }) From b9a9af5514aef32db85a463b764f56cbd5db3a4d Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 9 Mar 2023 19:30:22 +1100 Subject: [PATCH 11/13] Comments --- datafusion/common/src/column.rs | 2 ++ datafusion/common/src/dfschema.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 757f1f0727f6..a7de87e99164 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -112,6 +112,8 @@ impl Column { /// Serialize column into a quoted flat name string pub fn quoted_flat_name(&self) -> String { + // TODO: quote identifiers only when special characters present + // see: https://github.com/apache/arrow-datafusion/issues/5523 match &self.relation { Some(r) => { format!("{}.{}", r.to_quoted_string(), quote_identifier(&self.name)) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index d8e59ab6809a..5e9f2187a633 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -739,7 +739,7 @@ mod tests { let col = Column::from_name("t1.c0"); let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; // lookup with unqualified name "t1.c0" - let err = schema.index_of_column(&col).err().unwrap(); + let err = schema.index_of_column(&col).unwrap_err(); assert_eq!( r#"Schema error: No field named "t1.c0". Valid fields are "t1"."c0", "t1"."c1"."#, &format!("{err}") From 57ce66c24858cedac4f2c56194ea72f84d7be2f5 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 9 Mar 2023 19:49:33 +1100 Subject: [PATCH 12/13] Update docstrings --- datafusion/core/src/execution/context.rs | 2 +- .../src/physical_plan/file_format/parquet.rs | 4 +++ datafusion/expr/src/expr_fn.rs | 34 ++++++++++++++++--- .../physical-expr/src/intervals/cp_solver.rs | 2 +- datafusion/sql/src/expr/arrow_cast.rs | 2 +- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 2b99fdc11eea..027bdb37f31b 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -282,7 +282,7 @@ impl SessionContext { self.session_id.clone() } - /// Return the [`TableFactoryProvider`] that is registered for the + /// Return the [`TableProviderFactory`] that is registered for the /// specified file type, if any. 
pub fn table_factory( &self, diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index 2c1bfc9caa51..3f3b0bb74212 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -204,6 +204,8 @@ impl ParquetExec { /// `ParquetRecordBatchStream`. These filters are applied by the /// parquet decoder to skip unecessairly decoding other columns /// which would not pass the predicate. Defaults to false + /// + /// [`Expr`]: datafusion_expr::Expr pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self { self.pushdown_filters = Some(pushdown_filters); self @@ -219,6 +221,8 @@ impl ParquetExec { /// minimize the cost of filter evaluation by reordering the /// predicate [`Expr`]s. If false, the predicates are applied in /// the same order as specified in the query. Defaults to false. + /// + /// [`Expr`]: datafusion_expr::Expr pub fn with_reorder_filters(mut self, reorder_filters: bool) -> Self { self.reorder_filters = Some(reorder_filters); self diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index ba351b463599..ef6f8ac50fd8 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -28,19 +28,43 @@ use arrow::datatypes::DataType; use datafusion_common::{Column, Result}; use std::sync::Arc; -/// Create a column expression based on a qualified or unqualified column name +/// Create a column expression based on a qualified or unqualified column name. Will +/// normalize unquoted identifiers according to SQL rules (identifiers will become lowercase). /// -/// example: -/// ``` +/// For example: +/// +/// ```rust /// # use datafusion_expr::col; -/// let c = col("my_column"); +/// let c1 = col("a"); +/// let c2 = col("A"); +/// assert_eq!(c1, c2); +/// +/// // note how quoting with double quotes preserves the case +/// let c3 = col(r#""A""#); +/// assert_ne!(c1, c3); /// ``` pub fn col(ident: impl Into) -> Expr { Expr::Column(ident.into()) } /// Create an unqualified column expression from the provided name, without normalizing -/// the column +/// the column. +/// +/// For example: +/// +/// ```rust +/// # use datafusion_expr::{col, ident}; +/// let c1 = ident("A"); // not normalized staying as column 'A' +/// let c2 = col("A"); // normalized via SQL rules becoming column 'a' +/// assert_ne!(c1, c2); +/// +/// let c3 = col(r#""A""#); +/// assert_eq!(c1, c3); +/// +/// let c4 = col("t1.a"); // parses as relation 't1' column 'a' +/// let c5 = ident("t1.a"); // parses as column 't1.a' +/// assert_ne!(c4, c5); +/// ``` pub fn ident(name: impl Into) -> Expr { Expr::Column(Column::from_name(name)) } diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 302a86cdc927..66367001c642 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -326,7 +326,7 @@ impl ExprIntervalGraph { // ``` /// This function associates stable node indices with [PhysicalExpr]s so - /// that we can match Arc and NodeIndex objects during + /// that we can match `Arc` and NodeIndex objects during /// membership tests. 
pub fn gather_node_indices( &mut self, diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index bc1313e2c114..83d251a622c7 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -93,7 +93,7 @@ pub fn create_arrow_cast(mut args: Vec, schema: &DFSchema) -> Result /// assert_eq!(data_type, DataType::Int32); /// ``` /// -/// Remove if added to arrow: https://github.com/apache/arrow-rs/issues/3821 +/// Remove if added to arrow: pub fn parse_data_type(val: &str) -> Result { Parser::new(val).parse() } From 0a2062d1574d4b5b46cf846464f930f7077f06de Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:17:13 +1100 Subject: [PATCH 13/13] From OwnedTableReference to TableReference impl --- datafusion/common/src/table_reference.rs | 25 +++++++++++++++- datafusion/core/src/execution/context.rs | 30 ++++++++----------- .../src/engines/datafusion/create_table.rs | 6 ++-- .../src/engines/datafusion/insert.rs | 8 ++--- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 7fb3dfcc7d94..257073681934 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -190,7 +190,7 @@ impl<'a> TableReference<'a> { } /// Converts directly into an [`OwnedTableReference`] - pub fn to_owned_reference(self) -> OwnedTableReference { + pub fn to_owned_reference(&self) -> OwnedTableReference { match self { Self::Full { catalog, @@ -267,6 +267,29 @@ impl From for OwnedTableReference { } } +impl<'a> From<&'a OwnedTableReference> for TableReference<'a> { + fn from(value: &'a OwnedTableReference) -> Self { + match value { + OwnedTableReference::Bare { table } => TableReference::Bare { + table: Cow::Borrowed(table), + }, + OwnedTableReference::Partial { schema, table } => TableReference::Partial { + schema: Cow::Borrowed(schema), + table: Cow::Borrowed(table), + }, + OwnedTableReference::Full { + catalog, + schema, + table, + } => TableReference::Full { + catalog: Cow::Borrowed(catalog), + schema: Cow::Borrowed(schema), + table: Cow::Borrowed(table), + }, + } + } +} + /// Parse a string into a TableReference, normalizing where appropriate /// /// See full details on [`TableReference::parse_str`] diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 027bdb37f31b..050cdb6ee5f5 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -329,19 +329,19 @@ impl SessionContext { or_replace, }) => { let input = Arc::try_unwrap(input).unwrap_or_else(|e| e.as_ref().clone()); - let table = self.table(name.clone()).await; + let table = self.table(&name).await; match (if_not_exists, or_replace, table) { (true, false, Ok(_)) => self.return_empty_dataframe(), (false, true, Ok(_)) => { - self.deregister_table(name.clone())?; + self.deregister_table(&name)?; let schema = Arc::new(input.schema().as_ref().into()); let physical = DataFrame::new(self.state(), input); let batches: Vec<_> = physical.collect_partitioned().await?; let table = Arc::new(MemTable::try_new(schema, batches)?); - self.register_table(name.clone(), table)?; + self.register_table(&name, table)?; self.return_empty_dataframe() } (true, true, Ok(_)) => Err(DataFusionError::Execution( @@ -354,7 +354,7 @@ impl SessionContext { let batches: Vec<_> = physical.collect_partitioned().await?; let table = 
Arc::new(MemTable::try_new(schema, batches)?); - self.register_table(name, table)?; + self.register_table(&name, table)?; self.return_empty_dataframe() } (false, false, Ok(_)) => Err(DataFusionError::Execution(format!( @@ -369,22 +369,22 @@ impl SessionContext { or_replace, definition, }) => { - let view = self.table(name.clone()).await; + let view = self.table(&name).await; match (or_replace, view) { (true, Ok(_)) => { - self.deregister_table(name.clone())?; + self.deregister_table(&name)?; let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - self.register_table(name.clone(), table)?; + self.register_table(&name, table)?; self.return_empty_dataframe() } (_, Err(_)) => { let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - self.register_table(name, table)?; + self.register_table(&name, table)?; self.return_empty_dataframe() } (false, Ok(_)) => Err(DataFusionError::Execution(format!( @@ -396,9 +396,7 @@ impl SessionContext { LogicalPlan::DropTable(DropTable { name, if_exists, .. }) => { - let result = self - .find_and_deregister(name.clone(), TableType::Base) - .await; + let result = self.find_and_deregister(&name, TableType::Base).await; match (result, if_exists) { (Ok(true), _) => self.return_empty_dataframe(), (_, true) => self.return_empty_dataframe(), @@ -411,9 +409,7 @@ impl SessionContext { LogicalPlan::DropView(DropView { name, if_exists, .. }) => { - let result = self - .find_and_deregister(name.clone(), TableType::View) - .await; + let result = self.find_and_deregister(&name, TableType::View).await; match (result, if_exists) { (Ok(true), _) => self.return_empty_dataframe(), (_, true) => self.return_empty_dataframe(), @@ -571,7 +567,7 @@ impl SessionContext { &self, cmd: &CreateExternalTable, ) -> Result { - let exist = self.table_exist(cmd.name.clone())?; + let exist = self.table_exist(&cmd.name)?; if exist { match cmd.if_not_exists { true => return self.return_empty_dataframe(), @@ -586,7 +582,7 @@ impl SessionContext { let table_provider: Arc = self.create_custom_table(cmd).await?; - self.register_table(cmd.name.clone(), table_provider)?; + self.register_table(&cmd.name, table_provider)?; self.return_empty_dataframe() } @@ -1918,7 +1914,7 @@ impl SessionState { self.config.options.sql_parser.parse_float_as_decimal; for reference in references { let table = reference.table(); - let resolved = self.resolve_table_ref(reference.clone()); + let resolved = self.resolve_table_ref(&reference); if let Entry::Vacant(v) = provider.tables.entry(resolved.to_string()) { if let Ok(schema) = self.schema_for_ref(resolved) { if let Some(table) = schema.table(table).await { diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs index a753cb79953c..981dd75b56d7 100644 --- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs +++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/create_table.rs @@ -36,11 +36,11 @@ pub async fn create_table( ) -> Result { let table_reference = object_name_to_table_reference(name, ctx.enable_ident_normalization())?; - let existing_table = ctx.table(table_reference.clone()).await; + let existing_table = ctx.table(&table_reference).await; match (if_not_exists, or_replace, existing_table) { (true, false, Ok(_)) => Ok(DBOutput::StatementComplete(0)), (false, true, Ok(_)) => { - ctx.deregister_table(table_reference.clone())?; + ctx.deregister_table(&table_reference)?; 
create_new_table(ctx, table_reference, columns) } (true, true, Ok(_)) => { @@ -78,6 +78,6 @@ fn create_new_table( ); let schema = Arc::new(sql_to_rel.build_schema(columns)?); let table_provider = Arc::new(MemTable::try_new(schema, vec![])?); - ctx.register_table(table_reference.clone(), table_provider)?; + ctx.register_table(&table_reference, table_provider)?; Ok(DBOutput::StatementComplete(0)) } diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs index e518db2204f7..a8fca3b16c06 100644 --- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs +++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/insert.rs @@ -56,9 +56,9 @@ pub async fn insert(ctx: &SessionContext, insert_stmt: SQLStatement) -> Result Result