From 3e867a69fd12f706f61043a7d97f7d027183af21 Mon Sep 17 00:00:00 2001 From: Daniel Tenedorio Date: Fri, 16 Aug 2024 08:22:21 +0900 Subject: [PATCH] [SPARK-48966][SQL] Improve error message with invalid unresolved column reference in UDTF call ### What changes were proposed in this pull request? This bug covers improving an error message in the event of invalid UDTF calls. For example: ``` select * from udtf( observed => TABLE(select column from t), value_col => classic_dollars ) ``` Currently we get: ``` Unsupported subquery expression: Table arguments are used in a function where they are not supported: 'UnresolvedTableValuedFunction [udtf], [observed => table-argument#68918 [], value_col => 'classic_dollars, false +- Project ... +- SubqueryAlias ... +- Relation ... ``` But the real error is that the user passed column identifier classic_dollars rather than string "classic_dollars" into the string argument. The core reason is that `CheckAnalysis` checks the analyzer output tree for unresolved TABLE arguments first before checking for remaining unresolved attribute references. To fix it, we update `CheckAnalysis` to move the check for remaining `TABLE` arguments to later. Now that query returns something like: ``` { "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", "sqlState" : "42703", "messageParameters" : { "objectName" : "`classic_dollars`" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 93, "stopIndex" : 109, "fragment" : "classic_dollars" } ] } ``` ### Why are the changes needed? This improves error messages for SQL queries that are invalid, but that a user might reasonably create accidentally while figuring out how the syntax works for calling table-valued functions. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? This PR adds and updates golden file test coverage. ### Was this patch authored or co-authored using generative AI tooling? Just some casual, everyday GitHub CoPilot usage. Closes #47447 from dtenedor/improve-error-udtf. Authored-by: Daniel Tenedorio Signed-off-by: Hyukjin Kwon --- .../sql/catalyst/analysis/CheckAnalysis.scala | 15 +++++++++--- .../named-function-arguments.sql.out | 16 ++++++++----- .../analyzer-results/udtf/udtf.sql.out | 21 +++++++++++++++++ .../resources/sql-tests/inputs/udtf/udtf.sql | 6 +++++ .../results/named-function-arguments.sql.out | 16 ++++++++----- .../sql-tests/results/udtf/udtf.sql.out | 23 +++++++++++++++++++ .../execution/python/PythonUDTFSuite.scala | 13 ----------- 7 files changed, 82 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index b4753d0c6e4f7..594573f13cf00 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -400,6 +400,17 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => }) + + // Check for unresolved TABLE arguments after the main check above to allow other analysis + // errors to apply first, providing better error messages. + getAllExpressions(operator).foreach(_.foreachUp { + case expr: FunctionTableSubqueryArgumentExpression => + expr.failAnalysis( + errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", + messageParameters = Map("treeNode" -> planToString(plan))) + case _ => + }) + if (stagedError.isDefined) stagedError.get.apply() operator match { @@ -1078,9 +1089,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB checkCorrelationsInSubquery(expr.plan, isLateral = true) case _: FunctionTableSubqueryArgumentExpression => - expr.failAnalysis( - errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", - messageParameters = Map("treeNode" -> planToString(plan))) + // Do nothing here, since we will check for this pattern later. case inSubqueryOrExistsSubquery => plan match { diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out index 051ad64c6268c..a49d80912f081 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out @@ -202,17 +202,21 @@ SELECT * FROM explode(collection => TABLE(v)) -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", - "sqlState" : "0A000", + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", "messageParameters" : { - "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- Project [id#xL]\n: +- Range (0, 8, step=1)\n+- OneRowRelation\n" + "inputSql" : "\"functiontablesubqueryargumentexpression()\"", + "inputType" : "\"STRUCT\"", + "paramIndex" : "first", + "requiredType" : "(\"ARRAY\" or \"MAP\")", + "sqlExpr" : "\"explode(functiontablesubqueryargumentexpression())\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", - "startIndex" : 37, - "stopIndex" : 44, - "fragment" : "TABLE(v)" + "startIndex" : 15, + "stopIndex" : 45, + "fragment" : "explode(collection => TABLE(v))" } ] } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out index 312a684b4f7ea..8f42ba0388767 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out @@ -924,6 +924,27 @@ SELECT * FROM UDTFPartitionByIndexingBug( [Analyzer test output redacted due to nondeterminism] +-- !query +SELECT * FROM + InvalidEvalReturnsNoneToNonNullableColumnScalarType(TABLE(SELECT 1 AS X), unresolved_column) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`unresolved_column`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 93, + "stopIndex" : 109, + "fragment" : "unresolved_column" + } ] +} + + -- !query DROP VIEW t1 -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql b/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql index a437b1f93b604..a4f598618ab6f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql @@ -159,6 +159,12 @@ SELECT * FROM UDTFPartitionByIndexingBug( 1.0 AS double_col ) ); +-- Exercise a query with both a valid TABLE argument and an invalid unresolved column reference. +-- The 'eval' method of this UDTF would later throw an exception, but that is not relevant here +-- because the analysis of this query should fail before that point. We just want to make sure that +-- this analysis failure returns a reasonable error message. +SELECT * FROM + InvalidEvalReturnsNoneToNonNullableColumnScalarType(TABLE(SELECT 1 AS X), unresolved_column); -- cleanup DROP VIEW t1; diff --git a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out index 8f6844f0157ba..6dbc20b7153ee 100644 --- a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out @@ -185,17 +185,21 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", - "sqlState" : "0A000", + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", "messageParameters" : { - "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- Project [id#xL]\n: +- Range (0, 8, step=1)\n+- OneRowRelation\n" + "inputSql" : "\"functiontablesubqueryargumentexpression()\"", + "inputType" : "\"STRUCT\"", + "paramIndex" : "first", + "requiredType" : "(\"ARRAY\" or \"MAP\")", + "sqlExpr" : "\"explode(functiontablesubqueryargumentexpression())\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", - "startIndex" : 37, - "stopIndex" : 44, - "fragment" : "TABLE(v)" + "startIndex" : 15, + "stopIndex" : 45, + "fragment" : "explode(collection => TABLE(v))" } ] } diff --git a/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out b/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out index 9368aea5eb1f0..b205161511463 100644 --- a/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out @@ -1095,6 +1095,29 @@ NULL 1.0 NULL 1.0 +-- !query +SELECT * FROM + InvalidEvalReturnsNoneToNonNullableColumnScalarType(TABLE(SELECT 1 AS X), unresolved_column) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`unresolved_column`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 93, + "stopIndex" : 109, + "fragment" : "unresolved_column" + } ] +} + + -- !query DROP VIEW t1 -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala index 1cd469a8cae29..36a9c3c40e3e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala @@ -317,19 +317,6 @@ class PythonUDTFSuite extends QueryTest with SharedSparkSession { case other => failure(other) } - withTable("t") { - sql("create table t(col array) using parquet") - val query = "select * from explode(table(t))" - checkErrorMatchPVals( - exception = intercept[AnalysisException](sql(query)), - errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", - sqlState = None, - parameters = Map("treeNode" -> "(?s).*"), - context = ExpectedContext( - fragment = "table(t)", - start = 22, - stop = 29)) - } spark.udtf.registerPython(UDTFCountSumLast.name, pythonUDTFCountSumLast) var plan = sql(