diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index e893cee089c93..d374dc0d5ceef 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -18,7 +18,7 @@ //! DFSchema is an extended schema struct that DataFusion uses to provide support for //! fields with optional relation names. -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::hash::Hash; use std::sync::Arc; @@ -154,7 +154,6 @@ impl DFSchema { field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; - dfschema.check_names()?; Ok(dfschema) } @@ -183,7 +182,6 @@ impl DFSchema { field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), }; - dfschema.check_names()?; Ok(dfschema) } @@ -201,7 +199,6 @@ impl DFSchema { field_qualifiers: vec![Some(qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; - schema.check_names()?; Ok(schema) } @@ -215,40 +212,9 @@ impl DFSchema { field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; - dfschema.check_names()?; Ok(dfschema) } - /// Check if the schema have some fields with the same name - pub fn check_names(&self) -> Result<()> { - let mut qualified_names = BTreeSet::new(); - let mut unqualified_names = BTreeSet::new(); - - for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { - if let Some(qualifier) = qualifier { - if !qualified_names.insert((qualifier, field.name())) { - return _schema_err!(SchemaError::DuplicateQualifiedField { - qualifier: Box::new(qualifier.clone()), - name: field.name().to_string(), - }); - } - } else if !unqualified_names.insert(field.name()) { - return _schema_err!(SchemaError::DuplicateUnqualifiedField { - name: field.name().to_string() - }); - } - } - - for (qualifier, name) in qualified_names { - if unqualified_names.contains(name) { - return _schema_err!(SchemaError::AmbiguousReference { - field: Column::new(Some(qualifier.clone()), name) - }); - } - } - Ok(()) - } - /// Assigns functional dependencies. pub fn with_functional_dependencies( mut self, @@ -285,7 +251,6 @@ impl DFSchema { field_qualifiers: new_qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; - new_self.check_names()?; Ok(new_self) } @@ -1141,10 +1106,10 @@ mod tests { fn join_qualified_duplicate() -> Result<()> { let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; - let join = left.join(&right); + let join = left.join(&right)?; assert_eq!( - join.unwrap_err().strip_backtrace(), - "Schema error: Schema contains duplicate qualified field name t1.c0", + "fields:[t1.c0, t1.c1, t1.c0, t1.c1], metadata:{}", + join.to_string() ); Ok(()) } @@ -1153,11 +1118,8 @@ mod tests { fn join_unqualified_duplicate() -> Result<()> { let left = DFSchema::try_from(test_schema_1())?; let right = DFSchema::try_from(test_schema_1())?; - let join = left.join(&right); - assert_eq!( - join.unwrap_err().strip_backtrace(), - "Schema error: Schema contains duplicate unqualified field name c0" - ); + let join = left.join(&right)?; + assert_eq!("fields:[c0, c1, c0, c1], metadata:{}", join.to_string()); Ok(()) } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 05988d6c6da4c..25f0cd88a8da0 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -150,6 +150,11 @@ pub enum SchemaError { qualifier: Box, name: String, }, + /// Schema duplicate qualified fields with duplicate unqualified names + QualifiedFieldWithDuplicateName { + qualifier: Box, + name: String, + }, /// Schema contains duplicate unqualified field name DuplicateUnqualifiedField { name: String }, /// No field with this name @@ -188,6 +193,14 @@ impl Display for SchemaError { quote_identifier(name) ) } + Self::QualifiedFieldWithDuplicateName { qualifier, name } => { + write!( + f, + "Schema contains qualified fields with duplicate unqualified names {}.{}", + qualifier.to_quoted_string(), + quote_identifier(name) + ) + } Self::DuplicateUnqualifiedField { name } => { write!( f, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 90235e3f84c48..425689b80816b 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1557,8 +1557,6 @@ pub fn project( _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?), } } - validate_unique_names("Projections", projected_expr.iter())?; - Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection) } diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 61684945bb6cb..2ab51f5c836d3 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -15,22 +15,22 @@ // specific language governing permissions and limitations // under the License. +use crate::expr::Sort; use crate::{Expr, LogicalPlan, SortExpr, Volatility}; +use arrow::datatypes::DataType; +use datafusion_common::{ + schema_err, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, Result, + SchemaError, SchemaReference, TableReference, +}; +use sqlparser::ast::Ident; use std::cmp::Ordering; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::sync::Arc; use std::{ fmt::{self, Display}, hash::{Hash, Hasher}, }; -use crate::expr::Sort; -use arrow::datatypes::DataType; -use datafusion_common::{ - Constraints, DFSchemaRef, Result, SchemaReference, TableReference, -}; -use sqlparser::ast::Ident; - /// Various types of DDL (CREATE / DROP) catalog manipulation #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum DdlStatement { @@ -305,6 +305,7 @@ impl CreateExternalTable { constraints, column_defaults, } = fields; + check_fields_unique(&schema)?; Ok(Self { name, schema, @@ -543,6 +544,7 @@ impl CreateMemoryTable { column_defaults, temporary, } = fields; + check_fields_unique(input.schema())?; Ok(Self { name, constraints, @@ -697,6 +699,7 @@ impl CreateView { definition, temporary, } = fields; + check_fields_unique(input.schema())?; Ok(Self { name, input, @@ -799,6 +802,48 @@ impl CreateViewBuilder { }) } } +fn check_fields_unique(schema: &DFSchema) -> Result<()> { + // Use tree set for deterministic error messages + let mut qualified_names = BTreeSet::new(); + let mut unqualified_names = HashSet::new(); + let mut name_occurrences: HashMap<&String, usize> = HashMap::new(); + + for (qualifier, field) in schema.iter() { + if let Some(qualifier) = qualifier { + // Check for duplicate qualified field names + if !qualified_names.insert((qualifier, field.name())) { + return schema_err!(SchemaError::DuplicateQualifiedField { + qualifier: Box::new(qualifier.clone()), + name: field.name().to_string(), + }); + } + // Check for duplicate unqualified field names + } else if !unqualified_names.insert(field.name()) { + return schema_err!(SchemaError::DuplicateUnqualifiedField { + name: field.name().to_string() + }); + } + *name_occurrences.entry(field.name()).or_default() += 1; + } + + for (qualifier, name) in qualified_names { + // Check for duplicate between qualified and unqualified field names + if unqualified_names.contains(name) { + return schema_err!(SchemaError::AmbiguousReference { + field: Column::new(Some(qualifier.clone()), name) + }); + } + // Check for duplicates between qualified names as the qualification will be stripped off + if name_occurrences[name] > 1 { + return schema_err!(SchemaError::QualifiedFieldWithDuplicateName { + qualifier: Box::new(qualifier.clone()), + name: name.to_owned(), + }); + } + } + + Ok(()) +} /// Creates a catalog (aka "Database"). #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -1039,7 +1084,9 @@ impl PartialOrd for CreateIndex { #[cfg(test)] mod test { + use super::*; use crate::{CreateCatalog, DdlStatement, DropView}; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::{DFSchema, DFSchemaRef, TableReference}; use std::cmp::Ordering; @@ -1066,4 +1113,85 @@ mod test { assert_eq!(drop_view.partial_cmp(&catalog), Some(Ordering::Greater)); } + + #[test] + fn test_check_fields_unique() -> Result<()> { + // no duplicate fields, unqualified schema + check_fields_unique(&DFSchema::try_from(Schema::new(vec![ + Field::new("c100", DataType::Boolean, true), + Field::new("c101", DataType::Boolean, true), + ]))?)?; + + // no duplicate fields, qualified schema + check_fields_unique(&DFSchema::try_from_qualified_schema( + "t1", + &Schema::new(vec![ + Field::new("c100", DataType::Boolean, true), + Field::new("c101", DataType::Boolean, true), + ]), + )?)?; + + // duplicate unqualified field with same qualifier + assert_eq!( + check_fields_unique(&DFSchema::try_from(Schema::new(vec![ + Field::new("c0", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + Field::new("c2", DataType::Boolean, true), + ]))?) + .unwrap_err() + .to_string(), + "Schema error: Schema contains duplicate unqualified field name c1" + ); + + // duplicate qualified field with same qualifier + assert_eq!( + check_fields_unique(&DFSchema::try_from_qualified_schema( + "t1", + &Schema::new(vec![ + Field::new("c1", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + ]), + )?) + .unwrap_err() + .to_string(), + "Schema error: Schema contains duplicate qualified field name t1.c1" + ); + + // duplicate qualified and unqualified field + assert_eq!( + check_fields_unique(&DFSchema::from_field_specific_qualified_schema( + vec![ + None, + Some(TableReference::from("t1")), + ], + &Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + ])) + )?) + .unwrap_err() + .to_string(), + "Schema error: Schema contains qualified field name t1.c1 and unqualified field name c1 which would be ambiguous" + ); + + // qualified fields with duplicate unqualified names + assert_eq!( + check_fields_unique(&DFSchema::from_field_specific_qualified_schema( + vec![ + Some(TableReference::from("t1")), + Some(TableReference::from("t2")), + ], + &Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + ])) + )?) + .unwrap_err() + .to_string(), + "Schema error: Schema contains qualified fields with duplicate unqualified names t1.c1" + ); + + Ok(()) + } } diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 9fbe54e1ccb92..11aa5ca230277 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -55,7 +55,6 @@ fn expand_internal(plan: LogicalPlan) -> Result> { match plan { LogicalPlan::Projection(Projection { expr, input, .. }) => { let projected_expr = expand_exprlist(&input, expr)?; - validate_unique_names("Projections", projected_expr.iter())?; Ok(Transformed::yes( Projection::try_new(projected_expr, Arc::clone(&input)) .map(LogicalPlan::Projection)?, diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 236167985790d..d6f7e0972ac16 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -343,11 +343,11 @@ fn test_propagate_empty_relation_inner_join_and_unions() { #[test] fn select_wildcard_with_repeated_column() { let sql = "SELECT *, col_int32 FROM test"; - let err = test_sql(sql).expect_err("query should have failed"); - assert_eq!( - "Schema error: Schema contains duplicate qualified field name test.col_int32", - err.strip_backtrace() - ); + let plan = test_sql(sql).unwrap(); + let expected = "\ + Projection: test.col_int32, test.col_uint32, test.col_utf8, test.col_date32, test.col_date64, test.col_ts_nano_none, test.col_ts_nano_utc, test.col_int32\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]"; + assert_eq!(expected, format!("{plan}")); } #[test] diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index be26a832be177..2174dc1b85ec7 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1282,7 +1282,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let schema = self.build_schema(columns)?; let df_schema = schema.to_dfschema_ref()?; - df_schema.check_names()?; let ordered_exprs = self.build_order_by(order_exprs, &df_schema, &mut planner_context)?; diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index ab7e6c8d0bb73..e9a1b32a187ac 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -631,11 +631,10 @@ fn select_column_does_not_exist() { #[test] fn select_repeated_column() { - let sql = "SELECT age, age FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"person.age\" at position 0 and \"person.age\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() + quick_test( + "SELECT age, age FROM person", + "Projection: person.age, person.age\ + \n TableScan: person", ); } @@ -1334,11 +1333,11 @@ fn select_simple_aggregate_column_does_not_exist() { #[test] fn select_simple_aggregate_repeated_aggregate() { - let sql = "SELECT MIN(age), MIN(age) FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"min(person.age)\" at position 0 and \"min(person.age)\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() + quick_test( + "SELECT MIN(age), MIN(age) FROM person", + "Projection: min(person.age), min(person.age)\ + \n Aggregate: groupBy=[[]], aggr=[[min(person.age)]]\ + \n TableScan: person", ); } @@ -1375,11 +1374,11 @@ fn select_from_typed_string_values() { #[test] fn select_simple_aggregate_repeated_aggregate_with_repeated_aliases() { - let sql = "SELECT MIN(age) AS a, MIN(age) AS a FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"min(person.age) AS a\" at position 0 and \"min(person.age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() + quick_test( + "SELECT MIN(age) AS a, MIN(age) AS a FROM person", + "Projection: min(person.age) AS a, min(person.age) AS a\ + \n Aggregate: groupBy=[[]], aggr=[[min(person.age)]]\ + \n TableScan: person", ); } @@ -1405,11 +1404,11 @@ fn select_simple_aggregate_with_groupby_with_aliases() { #[test] fn select_simple_aggregate_with_groupby_with_aliases_repeated() { - let sql = "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"person.state AS a\" at position 0 and \"min(person.age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() + quick_test( + "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state", + "Projection: person.state AS a, min(person.age) AS a\ + \n Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]\ + \n TableScan: person", ); } @@ -1554,11 +1553,11 @@ fn select_simple_aggregate_with_groupby_can_use_alias() { #[test] fn select_simple_aggregate_with_groupby_aggregate_repeated() { - let sql = "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "Error during planning: Projections require unique expression names but the expression \"min(person.age)\" at position 1 and \"min(person.age)\" at position 2 have the same name. Consider aliasing (\"AS\") one of them.", - err.strip_backtrace() + quick_test( + "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state", + "Projection: person.state, min(person.age), min(person.age)\ + \n Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]\ + \n TableScan: person", ); } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 917e037682f24..ca798aee01506 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -72,7 +72,7 @@ CREATE TABLE test (c1 BIGINT,c2 BIGINT) as values ####### # https://github.com/apache/datafusion/issues/3353 -statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)" +statement error DataFusion error: Schema error: Ambiguous reference to unqualified field "approx_distinct\(aggregate_test_100\.c9\)" SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_weight diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index ed001cf9f84c5..94329e1b1dcd3 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -84,6 +84,10 @@ CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV FOOBAR BARBAR BARFOO LOCATION 'foo statement error DataFusion error: Arrow error: Schema error: Unable to get field named "c2". Valid fields: \["c1"\] create EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (c2) LOCATION 'foo.csv' +# Duplicate Column +statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name c1 +create EXTERNAL TABLE t(c1 int, c1 int) STORED AS CSV LOCATION 'foo.csv' + # Duplicate Column in `PARTITIONED BY` clause statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name c1 create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1 int) LOCATION 'foo.csv' diff --git a/datafusion/sqllogictest/test_files/create_table.slt b/datafusion/sqllogictest/test_files/create_table.slt new file mode 100644 index 0000000000000..3e30d7486f29a --- /dev/null +++ b/datafusion/sqllogictest/test_files/create_table.slt @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Issue https://github.com/apache/datafusion/issues/13487 +statement error DataFusion error: Schema error: Schema contains qualified fields with duplicate unqualified names l\.id +CREATE TABLE t AS SELECT * FROM (SELECT 1 AS id, 'Foo' AS name) l JOIN (SELECT 1 AS id, 'Bar' as name) r ON l.id = r.id; diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 1feacc5ebe53e..b2b6db129ad35 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -1215,14 +1215,18 @@ statement ok create table t1(v1 int) as values(100); ## Query with Ambiguous column reference -query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 +query I select count(*) from t1 right outer join t1 on t1.v1 > 0; +---- +1 -query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 +query I select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1) as t1); +---- +100 statement ok drop table t1; diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index c687429ae6ecf..9994ca3ba4ef2 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1777,6 +1777,42 @@ create table users (id int, name varchar); statement ok insert into users values (1, 'Tom'); +query II +SELECT id, id FROM users +---- +1 1 + +query II +SELECT users.id, users.id FROM users +---- +1 1 + +query ITI +SELECT *, id FROM users +---- +1 Tom 1 + +query ITIT +SELECT *, * FROM users +---- +1 Tom 1 Tom + +query ITITIIII +SELECT *, *, id, id, -id AS id, id*3 AS id FROM users +---- +1 Tom 1 Tom 1 1 -1 3 + +query II +SELECT id AS col, id+1 AS col FROM users +---- +1 2 + +# TODO When joining using USING, the condition columns should appear once in the output, and should be selectible using unqualified name only +query ITIT +SELECT * FROM users JOIN users USING (id); +---- +1 Tom 1 Tom + statement ok create view v as select count(id) from users; diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 2e1b8b87cc429..8b0c1c277f03e 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -533,8 +533,16 @@ select unnest(column1) from (select * from (values([1,2,3]), ([4,5,6])) limit 1 5 6 -query error DataFusion error: Error during planning: Projections require unique expression names but the expression "UNNEST\(unnest_table.column1\)" at position 0 and "UNNEST\(unnest_table.column1\)" at position 1 have the same name. Consider aliasing \("AS"\) one of them. +query II select unnest(column1), unnest(column1) from unnest_table; +---- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +12 12 query II select unnest(column1), unnest(column1) u1 from unnest_table;