Create non-shim specific version of ParquetCachedBatchSerializer (#3473)
* Removed redundant PCBS and GpuInMemoryTableScanExec classes

Signed-off-by: Raza Jafri <[email protected]>

* turn the nightly tests on

Signed-off-by: Raza Jafri <[email protected]>

* fixed pcbs and InMemoryTableScanExec package

Signed-off-by: Raza Jafri <[email protected]>

* Revert "turn the nightly tests on"

This reverts commit cafaa08.

Signed-off-by: Raza Jafri <[email protected]>

* adding no-default-profile to spark313

Signed-off-by: Raza Jafri <[email protected]>

* cleanup

* Add common class for the ParquetCachedBatchSerializer

Signed-off-by: Thomas Graves <[email protected]>

* Fix the shim layer to pick up the proper versions of Parquet cached
batch serializer

Signed-off-by: Thomas Graves <[email protected]>

* docs

* Update docs

* Fix databricks build

* re-enable cache tests

* Fix the class being used for Spark 3.2.0

* Update includes

* fix style

Co-authored-by: Raza Jafri <[email protected]>
tgravescs and razajafri authored Sep 14, 2021
1 parent 541b9a9 commit b3f9773
Showing 27 changed files with 273 additions and 8,200 deletions.
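
The commit message above describes replacing the shim-specific serializer with a common, non-shim-specific entry point (`com.nvidia.spark.ParquetCachedBatchSerializer`) that the shim layer backs with the correct per-Spark-version implementation. The following is only a rough sketch of that delegation pattern, not the committed code: the class name used here, the delegate class name, and the reflection-based lookup are assumptions for illustration; the actual plugin resolves the implementation through its own shim loader.

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.columnar.{CachedBatch, CachedBatchSerializer}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.storage.StorageLevel

// Illustrative only: a version-agnostic serializer that users can name in
// spark.sql.cache.serializer, forwarding every call to an implementation that
// is looked up at runtime. The delegate class name below is hypothetical.
class ShimDelegatingCachedBatchSerializer extends CachedBatchSerializer {

  // Resolve the Spark-version-specific serializer lazily, on first use.
  private lazy val delegate: CachedBatchSerializer =
    Class.forName("com.example.shims.VersionSpecificCachedBatchSerializer") // hypothetical
      .getDeclaredConstructor()
      .newInstance()
      .asInstanceOf[CachedBatchSerializer]

  override def supportsColumnarInput(schema: Seq[Attribute]): Boolean =
    delegate.supportsColumnarInput(schema)

  override def supportsColumnarOutput(schema: StructType): Boolean =
    delegate.supportsColumnarOutput(schema)

  override def convertColumnarBatchToCachedBatch(
      input: RDD[ColumnarBatch],
      schema: Seq[Attribute],
      storageLevel: StorageLevel,
      conf: SQLConf): RDD[CachedBatch] =
    delegate.convertColumnarBatchToCachedBatch(input, schema, storageLevel, conf)

  override def convertInternalRowToCachedBatch(
      input: RDD[InternalRow],
      schema: Seq[Attribute],
      storageLevel: StorageLevel,
      conf: SQLConf): RDD[CachedBatch] =
    delegate.convertInternalRowToCachedBatch(input, schema, storageLevel, conf)

  override def convertCachedBatchToColumnarBatch(
      input: RDD[CachedBatch],
      cacheAttributes: Seq[Attribute],
      selectedAttributes: Seq[Attribute],
      conf: SQLConf): RDD[ColumnarBatch] =
    delegate.convertCachedBatchToColumnarBatch(input, cacheAttributes, selectedAttributes, conf)

  override def convertCachedBatchToInternalRow(
      input: RDD[CachedBatch],
      cacheAttributes: Seq[Attribute],
      selectedAttributes: Seq[Attribute],
      conf: SQLConf): RDD[InternalRow] =
    delegate.convertCachedBatchToInternalRow(input, cacheAttributes, selectedAttributes, conf)

  override def buildFilter(
      predicates: Seq[Expression],
      cachedAttributes: Seq[Attribute]): (Int, Iterator[CachedBatch]) => Iterator[CachedBatch] =
    delegate.buildFilter(predicates, cachedAttributes)
}
```

The practical benefit, visible in the docs change below, is that the class name users configure no longer encodes a Spark version.
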
1 change: 1 addition & 0 deletions dist/README.md
@@ -27,3 +27,4 @@ If you have to change the contents of the uber jar the following files control w

1. `unshimmed-base.txt` - this has classes and files that should go into the base jar with their normal package name (not shaded). This includes user-visible classes (i.e. com/nvidia/spark/SQLPlugin), Python files, and other files that aren't version specific. The jar built for Spark 3.0.1 is used for these base classes.
2. `unshimmed-extras.txt` - This is applied to all of the individual Spark-specific version jars to pull out any files that need to go into the base of the jar rather than into a Spark-specific directory.
3. `unshimmed-spark311.txt` - This is applied to the Spark 3.1.1 specific version jar to pull out any files that need to go into the base of the jar rather than into the Spark-specific directory (a quick way to verify where these classes land is sketched below).
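
One way to check where these include lists actually placed things is to list the dist jar's entries: classes matched by the unshimmed files should appear at the jar root (e.g. `com/nvidia/spark/...`), while everything else sits under a per-version directory such as `spark311/`. A minimal sketch, assuming a locally built dist jar; the path is illustrative:

```scala
import java.util.jar.JarFile
import scala.collection.JavaConverters._

// Hypothetical path: point this at the dist jar produced by the build.
val jarPath = "dist/target/rapids-4-spark_2.12-SNAPSHOT.jar"
val jar = new JarFile(jarPath)
val entries = jar.entries().asScala.map(_.getName).toList
jar.close()

// Entries for the serializer should appear at the jar root, not only under spark311/.
entries.filter(_.contains("ParquetCachedBatchSerializer")).foreach(println)
```
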
51 changes: 41 additions & 10 deletions dist/pom.xml
@@ -245,13 +245,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -516,13 +522,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -728,13 +740,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/spark312"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>

<unzip
@@ -1014,13 +1032,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1342,13 +1366,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1622,6 +1652,7 @@
<patternset id="sharedWorld">
<includesfile name="${project.basedir}/unshimmed-base.txt"/>
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
2 changes: 2 additions & 0 deletions dist/unshimmed-spark311.txt
@@ -0,0 +1,2 @@
com/nvidia/spark/ParquetCachedBatchSerializer*
com/nvidia/spark/GpuCachedBatchSerializer*
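
These two globs exist because users reference the serializer purely by class name in `spark.sql.cache.serializer`, before any Spark-version shim has been selected, so the classes must resolve from the base of the jar. A minimal spark-shell check, assuming the plugin jar is already on the driver classpath:

```scala
// Should succeed without involving any spark3xx-specific directory on the classpath.
val loaded =
  try { Class.forName("com.nvidia.spark.ParquetCachedBatchSerializer"); true }
  catch { case _: ClassNotFoundException => false }
println(s"ParquetCachedBatchSerializer resolvable: $loaded")
```
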
2 changes: 1 addition & 1 deletion docs/additional-functionality/cache-serializer.md
@@ -37,7 +37,7 @@ nav_order: 2

To use this serializer, please run Spark with the following conf.
```
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer
```
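
The same setting can also be applied when building the session programmatically. A minimal sketch follows; the app name and workload are illustrative, and since `spark.sql.cache.serializer` is a static conf it must be set before the session is created:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative: set the cache serializer at session creation time.
val spark = SparkSession.builder()
  .appName("pcbs-example")
  .config("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
  .getOrCreate()

// Subsequent .cache()/.persist() calls on DataFrames use the Parquet-based cached batches.
spark.range(0, 1000).toDF("id").cache().count()
```
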


20 changes: 9 additions & 11 deletions jenkins/databricks/test.sh
@@ -59,17 +59,16 @@ IS_SPARK_311_OR_LATER=0
[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VER" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1

TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [ -d "$LOCAL_JAR_PATH" ]; then
## Run tests with jars in the LOCAL_JAR_PATH dir downloaded from the dependency repo
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
@@ -80,12 +79,11 @@ else
## Run tests with jars building from the spark-rapids source code
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
7 changes: 2 additions & 5 deletions jenkins/spark-tests.sh
@@ -69,9 +69,6 @@ IS_SPARK_311_OR_LATER=0
export SPARK_TASK_MAXFAILURES=1
[[ "$IS_SPARK_311_OR_LATER" -eq "0" ]] && SPARK_TASK_MAXFAILURES=4

IS_SPARK_311=0
[[ "$SPARK_VER" == "3.1.1" ]] && IS_SPARK_311=1

export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"

#stop and restart SPARK ETL
@@ -138,7 +135,7 @@ run_test() {

cache_serializer)
SPARK_SUBMIT_FLAGS="$BASE_SPARK_SUBMIT_ARGS $SEQ_CONF \
--conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer" \
--conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer" \
./run_pyspark_from_build.sh -k cache_test
;;

@@ -179,7 +176,7 @@ fi
run_test cudf_udf_test

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
if [[ "$IS_SPARK_311" -eq "1" ]]; then
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
run_test cache_serializer
fi
