From 1e92155363292b06347158374dd8fbdcced0a636 Mon Sep 17 00:00:00 2001 From: Daniel Tenedorio Date: Fri, 10 May 2024 08:44:07 +0900 Subject: [PATCH] [SPARK-48180][SQL] Improve error when UDTF call with TABLE arg forgets parentheses around multiple PARTITION/ORDER BY exprs ### What changes were proposed in this pull request? This PR improves the error message when a table-valued function call has a TABLE argument with a PARTITION BY or ORDER BY clause with more than one associated expression, but forgets parentheses around them. For example: ``` SELECT * FROM testUDTF( TABLE(SELECT 1 AS device_id, 2 AS data_ds) WITH SINGLE PARTITION ORDER BY device_id, data_ds) ``` This query previously returned an obscure, unrelated error: ``` [UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT] Unsupported subquery expression: Table arguments are used in a function where they are not supported: 'UnresolvedTableValuedFunction [tvf], [table-argument#338 [], 'data_ds](https://issues.apache.org/jira/browse/SPARK-48180#338%20[],%20'data_ds), false +- Project [1 AS device_id#336, 2 AS data_ds#337](https://issues.apache.org/jira/browse/SPARK-48180#336,%202%20AS%20data_ds#337) +- OneRowRelation ``` Now it returns a reasonable error: ``` The table function call includes a table argument with an invalid partitioning/ordering specification: the ORDER BY clause included multiple expressions without parentheses surrounding them; please add parentheses around these expressions and then retry the query again. (line 4, pos 2) == SQL == SELECT * FROM testUDTF( TABLE(SELECT 1 AS device_id, 2 AS data_ds) WITH SINGLE PARTITION --^^^ ORDER BY device_id, data_ds) ``` ### Why are the changes needed? Here we improve error messages for common SQL syntax mistakes. ### Does this PR introduce _any_ user-facing change? Yes, see above. ### How was this patch tested? This PR adds test coverage. ### Was this patch authored or co-authored using generative AI tooling? No Closes #46451 from dtenedor/udtf-analyzer-bug. Authored-by: Daniel Tenedorio Signed-off-by: Hyukjin Kwon --- .../sql/catalyst/parser/SqlBaseParser.g4 | 2 ++ .../sql/catalyst/parser/AstBuilder.scala | 14 ++++++++++ .../sql/catalyst/parser/PlanParserSuite.scala | 19 ++++++++++---- .../execution/python/PythonUDTFSuite.scala | 26 +++++++++++++++++++ 4 files changed, 56 insertions(+), 5 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 653224c5475f8..249f55fa40ac5 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -838,9 +838,11 @@ tableArgumentPartitioning : ((WITH SINGLE PARTITION) | ((PARTITION | DISTRIBUTE) BY (((LEFT_PAREN partition+=expression (COMMA partition+=expression)* RIGHT_PAREN)) + | (expression (COMMA invalidMultiPartitionExpression=expression)+) | partition+=expression))) ((ORDER | SORT) BY (((LEFT_PAREN sortItem (COMMA sortItem)* RIGHT_PAREN) + | (sortItem (COMMA invalidMultiSortItem=sortItem)+) | sortItem)))? ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 7d2355b2f08d1..326f1e7684b9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1638,6 +1638,20 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } partitionByExpressions = p.partition.asScala.map(expression).toSeq orderByExpressions = p.sortItem.asScala.map(visitSortItem).toSeq + def invalidPartitionOrOrderingExpression(clause: String): String = { + "The table function call includes a table argument with an invalid " + + s"partitioning/ordering specification: the $clause clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around " + + "these expressions and then retry the query again" + } + validate( + Option(p.invalidMultiPartitionExpression).isEmpty, + message = invalidPartitionOrOrderingExpression("PARTITION BY"), + ctx = p.invalidMultiPartitionExpression) + validate( + Option(p.invalidMultiSortItem).isEmpty, + message = invalidPartitionOrOrderingExpression("ORDER BY"), + ctx = p.invalidMultiSortItem) } validate( !(withSinglePartition && partitionByExpressions.nonEmpty), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 17dd7349e7bea..8d01040563361 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -1617,14 +1617,23 @@ class PlanParserSuite extends AnalysisTest { parameters = Map( "error" -> "'order'", "hint" -> "")) - val sql8 = s"select * from my_tvf(arg1 => table(select col1, col2, col3 from v2) " + - s"$partition by col1, col2 order by col2 asc, col3 desc)" + val sql8tableArg = "table(select col1, col2, col3 from v2)" + val sql8partition = s"$partition by col1, col2 order by col2 asc, col3 desc" + val sql8 = s"select * from my_tvf(arg1 => $sql8tableArg $sql8partition)" checkError( exception = parseException(sql8), - errorClass = "PARSE_SYNTAX_ERROR", + errorClass = "_LEGACY_ERROR_TEMP_0064", parameters = Map( - "error" -> "'order'", - "hint" -> ": extra input 'order'")) + "msg" -> + ("The table function call includes a table argument with an invalid " + + "partitioning/ordering specification: the PARTITION BY clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around " + + "these expressions and then retry the query again")), + context = ExpectedContext( + fragment = s"$sql8tableArg $sql8partition", + start = 29, + stop = 110 + partition.length) + ) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala index 989597ae041db..1eaf1d24056da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.python import org.apache.spark.api.python.PythonEvalType import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.{Add, Alias, Expression, FunctionTableSubqueryArgumentExpression, Literal} +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, OneRowRelation, Project, Repartition, RepartitionByExpression, Sort, SubqueryAlias} import org.apache.spark.sql.functions.lit import org.apache.spark.sql.internal.SQLConf @@ -363,4 +364,29 @@ class PythonUDTFSuite extends QueryTest with SharedSparkSession { Row("abc")) } } + + test("SPARK-48180: Analyzer bug with multiple ORDER BY items for input table argument") { + assume(shouldTestPythonUDFs) + spark.udtf.registerPython("testUDTF", pythonUDTF) + checkError( + exception = intercept[ParseException](sql( + """ + |SELECT * FROM testUDTF( + | TABLE(SELECT 1 AS device_id, 2 AS data_ds) + | WITH SINGLE PARTITION + | ORDER BY device_id, data_ds) + |""".stripMargin)), + errorClass = "_LEGACY_ERROR_TEMP_0064", + parameters = Map("msg" -> + ("The table function call includes a table argument with an invalid " + + "partitioning/ordering specification: the ORDER BY clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around these " + + "expressions and then retry the query again")), + context = ExpectedContext( + fragment = "TABLE(SELECT 1 AS device_id, 2 AS data_ds)\n " + + "WITH SINGLE PARTITION\n " + + "ORDER BY device_id, data_ds", + start = 27, + stop = 122)) + } }