Skip to content

Commit

Permalink
refine ut framework including Part 1 and Part 2 (NVIDIA#10861)
Browse files Browse the repository at this point in the history
* refine UT framework to promote GPU evaluation

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

* enable some exprs for json

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

* exclude flaky tests

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

* fix review comments

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

* use vectorized parameter where possible

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

* add todo for utc issue

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>

---------

Signed-off-by: Hongbin Ma (Mahone) <[email protected]>
  • Loading branch information
binmahone authored May 24, 2024
1 parent 5a1863b commit c5da29d
Show file tree
Hide file tree
Showing 58 changed files with 404 additions and 83 deletions.
1 change: 1 addition & 0 deletions docs/additional-functionality/advanced_configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.BitwiseNot"></a>spark.rapids.sql.expression.BitwiseNot|`~`|Returns the bitwise NOT of the operands|true|None|
<a name="sql.expression.BitwiseOr"></a>spark.rapids.sql.expression.BitwiseOr|`\|`|Returns the bitwise OR of the operands|true|None|
<a name="sql.expression.BitwiseXor"></a>spark.rapids.sql.expression.BitwiseXor|`^`|Returns the bitwise XOR of the operands|true|None|
<a name="sql.expression.BoundReference"></a>spark.rapids.sql.expression.BoundReference| |Reference to a bound variable|true|None|
<a name="sql.expression.CaseWhen"></a>spark.rapids.sql.expression.CaseWhen|`when`|CASE WHEN expression|true|None|
<a name="sql.expression.Cast"></a>spark.rapids.sql.expression.Cast|`bigint`, `binary`, `boolean`, `cast`, `date`, `decimal`, `double`, `float`, `int`, `smallint`, `string`, `timestamp`, `tinyint`|Convert a column of one type of data into another type|true|None|
<a name="sql.expression.Cbrt"></a>spark.rapids.sql.expression.Cbrt|`cbrt`|Cube root|true|None|
Expand Down
48 changes: 48 additions & 0 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -4112,6 +4112,54 @@ are limited.
<td> </td>
</tr>
<tr>
<td rowSpan="2">BoundReference</td>
<td rowSpan="2"> </td>
<td rowSpan="2">Reference to a bound variable</td>
<td rowSpan="2">None</td>
<td rowSpan="1">project</td>
<td>result</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for TIMESTAMP</em></td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types CALENDAR, UDT</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types CALENDAR, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
<td rowSpan="1">AST</td>
<td>result</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for TIMESTAMP</em></td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
</tr>
<tr>
<td rowSpan="3">CaseWhen</td>
<td rowSpan="3">`when`</td>
<td rowSpan="3">CASE WHEN expression</td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,19 @@ object GpuOverrides extends Logging {
override def convertToGpu(child: Expression): GpuExpression =
GpuAlias(child, a.name)(a.exprId, a.qualifier, a.explicitMetadata)
}),
// Replacement rule for Catalyst's BoundReference. It is registered through
// ExprChecks.projectAndAst, i.e. with type checks for both the project and the AST contexts.
expr[BoundReference](
"Reference to a bound variable",
ExprChecks.projectAndAst(
TypeSig.astTypes + GpuTypeShims.additionalCommonOperatorSupportedTypes,
(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.MAP + TypeSig.ARRAY + TypeSig.STRUCT +
TypeSig.DECIMAL_128 + TypeSig.BINARY +
GpuTypeShims.additionalCommonOperatorSupportedTypes).nested(),
TypeSig.all),
// NOTE(review): despite its name, `currentRow` is the BoundReference expression being wrapped
// (it is handed to ExprMeta[BoundReference] and queried for ordinal/dataType/nullable below),
// not a row of data.
(currentRow, conf, p, r) => new ExprMeta[BoundReference](currentRow, conf, p, r) {
// Carries over the ordinal, data type and nullability of the CPU expression; the second
// argument list is given a freshly generated expression id and an empty name.
override def convertToGpu(): GpuExpression = GpuBoundReference(
currentRow.ordinal, currentRow.dataType, currentRow.nullable)(
NamedExpression.newExprId, "")
}),
expr[AttributeReference](
"References an input column",
ExprChecks.projectAndAst(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,13 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
.stringConf
.createWithDefault(false.toString)

// Test-only, internal boolean switch (default: false) that allows foldable expressions which
// are not literals. Exposed to the rest of the plugin as RapidsConf.isFoldableNonLitAllowed.
val FOLDABLE_NON_LIT_ALLOWED = conf("spark.rapids.sql.test.isFoldableNonLitAllowed")
.doc("Only to be used in tests. If `true` the foldable expressions that are not literals " +
"will be allowed")
.internal()
.booleanConf
.createWithDefault(false)

val TEST_CONF = conf("spark.rapids.sql.test.enabled")
.doc("Intended to be used by unit tests, if enabled all operations must run on the " +
"GPU or an error happens.")
Expand Down Expand Up @@ -2428,6 +2435,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val isTestEnabled: Boolean = get(TEST_CONF)

// Test-only flag: the configured value of FOLDABLE_NON_LIT_ALLOWED.
lazy val isFoldableNonLitAllowed: Boolean = get(FOLDABLE_NON_LIT_ALLOWED)

/**
* Convert a string value to the injection configuration OomInjection.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1108,7 +1108,7 @@ abstract class BaseExprMeta[INPUT <: Expression](
case _ => ExpressionContext.getRegularOperatorContext(this)
}

val isFoldableNonLitAllowed: Boolean = false
val isFoldableNonLitAllowed: Boolean = conf.isFoldableNonLitAllowed

// There are 4 levels of timezone check in GPU plan tag phase:
// Level 1: Check whether an expression is related to timezone. This is achieved by
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*** spark-rapids-shim-json-lines
{"spark": "330"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.suites

import org.apache.spark.sql.catalyst.expressions.JsonExpressionsSuite
import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsTestsTrait}

/**
 * Spark's `JsonExpressionsSuite` with `RapidsTestsTrait` and `RapidsJsonConfTrait` mixed in.
 *
 * The suite adds no tests of its own; every test case is inherited from the Spark suite.
 */
class RapidsJsonExpressionsSuite
  extends JsonExpressionsSuite
  with RapidsTestsTrait
  with RapidsJsonConfTrait
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.suites

import org.apache.spark.sql.JsonFunctionsSuite
import org.apache.spark.sql.rapids.utils.RapidsSQLTestsTrait
import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsSQLTestsTrait}

class RapidsJsonFunctionsSuite extends JsonFunctionsSuite with RapidsSQLTestsTrait {}
class RapidsJsonFunctionsSuite
extends JsonFunctionsSuite with RapidsSQLTestsTrait with RapidsJsonConfTrait {}
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ import org.apache.spark.sql.execution.datasources.{InMemoryFileIndex, NoopCache}
import org.apache.spark.sql.execution.datasources.json.JsonSuite
import org.apache.spark.sql.execution.datasources.v2.json.JsonScanBuilder
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait
import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsSQLTestsBaseTrait}
import org.apache.spark.sql.sources
import org.apache.spark.sql.types.{IntegerType, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap

class RapidsJsonSuite extends JsonSuite with RapidsSQLTestsBaseTrait {

class RapidsJsonSuite
extends JsonSuite with RapidsSQLTestsBaseTrait with RapidsJsonConfTrait {
/** Returns full path to the given file in the resource folder */
override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*** spark-rapids-shim-json-lines
{"spark": "330"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.utils

import org.scalatest.{BeforeAndAfterAll, Suite}

import org.apache.spark.sql.internal.SQLConf

/**
 * Mixin that switches on the RAPIDS JSON expression configs for the lifetime of a suite.
 *
 * `beforeAll` first runs the parent's setup and then sets every key in
 * `jsonExpressionConfKeys` to "true" on `SQLConf.get`. `afterAll` unsets the same keys and
 * then always delegates to the parent's teardown, even if unsetting a key fails.
 */
trait RapidsJsonConfTrait extends BeforeAndAfterAll { this: Suite =>

  // Single source of truth for the keys touched in setup and teardown, so the two cannot drift
  // apart. A def (not a val) so it does not depend on trait initialization order.
  private def jsonExpressionConfKeys: Seq[String] = Seq(
    "spark.rapids.sql.expression.JsonTuple",
    "spark.rapids.sql.expression.GetJsonObject",
    "spark.rapids.sql.expression.JsonToStructs",
    "spark.rapids.sql.expression.StructsToJson")

  /** Runs the parent's setup, then enables each JSON expression config. */
  override def beforeAll(): Unit = {
    super.beforeAll()
    jsonExpressionConfKeys.foreach { key =>
      SQLConf.get.setConfString(key, true.toString)
    }
  }

  /** Removes the JSON expression configs, then runs the parent's teardown. */
  override def afterAll(): Unit = {
    try {
      jsonExpressionConfKeys.foreach { key =>
        SQLConf.get.unsetConf(key)
      }
    } finally {
      // Must run even when the cleanup above throws, otherwise the parent's teardown
      // is skipped.
      super.afterAll()
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ object RapidsSQLTestsBaseTrait {
"org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback")
.set("spark.sql.warehouse.dir", warehouse)
.set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
// TODO: remove hard coded UTC https://github.com/NVIDIA/spark-rapids/issues/10874
.set("spark.sql.session.timeZone", "UTC")
.set("spark.rapids.sql.explain", "ALL")
// uncomment the configs below to run in `strict mode`, where any fallback to CPU is treated
// as a failure
// .set("spark.rapids.sql.test.enabled", "true")
// .set("spark.rapids.sql.test.allowedNonGpu",
// "SerializeFromObjectExec,DeserializeToObjectExec,ExternalRDDScanExec")
.setAppName("rapids spark plugin running Vanilla Spark UT")

conf
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.utils

import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, RapidsStringFunctionsSuite}
import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonExpressionsSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, RapidsStringFunctionsSuite}

// Some settings' line length exceeds 100
// scalastyle:off line.size.limit
Expand All @@ -41,7 +41,28 @@ class RapidsTestSettings extends BackendTestSettings {
.exclude("SPARK-17641: collect functions should not collect null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
.exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
.exclude("SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs should not fail", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10801"))
enableSuite[RapidsJsonExpressionsSuite]
.exclude("from_json - invalid data", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("from_json - input=empty array, schema=struct, output=single row with null", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("from_json - input=empty object, schema=struct, output=single row with null", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("SPARK-20549: from_json bad UTF-8", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("from_json with timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json - array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json - array with single empty row", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json - empty array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json with timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("SPARK-21513: to_json support map[string, struct] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("SPARK-21513: to_json support map[struct, struct] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("SPARK-21513: to_json support map[string, integer] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json - array with maps", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("to_json - array with single map", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
.exclude("from_json missing fields", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))
enableSuite[RapidsJsonFunctionsSuite]
.exclude("from_json invalid json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852"))
.exclude("to_json - array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852"))
.exclude("to_json - map", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852"))
.exclude("to_json - array of primitive types", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852"))
.exclude("SPARK-33134: return partial results only for root JSON objects", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852"))
enableSuite[RapidsJsonSuite]
.exclude("Casting long as timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773"))
.exclude("Write timestamps correctly with timestampFormat option and timeZone option", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773"))
Expand All @@ -58,24 +79,14 @@ class RapidsTestSettings extends BackendTestSettings {
.exclude("SPARK-37360: Timestamp type inference for a mix of TIMESTAMP_NTZ and TIMESTAMP_LTZ", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773"))
enableSuite[RapidsMathFunctionsSuite]
enableSuite[RapidsRegexpExpressionsSuite]
.exclude("RegexReplace", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774"))
.exclude("RegexExtract", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774"))
.exclude("RegexExtractAll", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774"))
.exclude("SPLIT", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774"))
enableSuite[RapidsStringExpressionsSuite]
.exclude("concat", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("format_number / FormatNumber", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("SPARK-22498: Concat should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("SPARK-22549: ConcatWs should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("SPARK-22550: Elt should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("StringComparison", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("Substring", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("ascii for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("base64/unbase64 for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("encode/decode for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("SPARK-22603: FormatString should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("LOCATE", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("LPAD/RPAD", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("REPEAT", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("length for string / binary", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
.exclude("ParseUrl", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775"))
enableSuite[RapidsStringFunctionsSuite]
}
Expand Down
Loading

0 comments on commit c5da29d

Please sign in to comment.