Create non-shim specific version of ParquetCachedBatchSerializer (#3473)
* Removed redundant PCBS and GpuInMemoryTableScanExec classes

Signed-off-by: Raza Jafri <[email protected]>

* turn the nightly tests on

Signed-off-by: Raza Jafri <[email protected]>

* fixed pcbs and InMemoryTableScanExec package

Signed-off-by: Raza Jafri <[email protected]>

* Revert "turn the nightly tests on"

This reverts commit cafaa08.

Signed-off-by: Raza Jafri <[email protected]>

* adding no-default-profile to spark313

Signed-off-by: Raza Jafri <[email protected]>

* cleanup

* Add common class for the ParquetCachedBatchSerializer

Signed-off-by: Thomas Graves <[email protected]>

* Fix the shim layer to pick up the proper versions of Parquet cached
batch serializer

Signed-off-by: Thomas Graves <[email protected]>

* docs

* Update docs

* Fix databricks build

* re-enable cache tests

* Fix the class being used for Spark 3.2.0

* Update includes

* fix style

Co-authored-by: Raza Jafri <[email protected]>
tgravescs and razajafri authored Sep 14, 2021
1 parent 541b9a9 commit b3f9773
Showing 27 changed files with 273 additions and 8,200 deletions.
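
The commit message above describes replacing the shim-specific serializer with a common, non-shim-specific entry point (`com.nvidia.spark.ParquetCachedBatchSerializer`) that the shim layer backs with the correct per-Spark-version implementation. The following is only a rough sketch of that delegation pattern, not the committed code: the class name used here, the delegate class name, and the reflection-based lookup are assumptions for illustration; the actual plugin resolves the implementation through its own shim loader.

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.columnar.{CachedBatch, CachedBatchSerializer}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.storage.StorageLevel

// Illustrative only: a version-agnostic serializer that users can name in
// spark.sql.cache.serializer, forwarding every call to an implementation that
// is looked up at runtime. The delegate class name below is hypothetical.
class ShimDelegatingCachedBatchSerializer extends CachedBatchSerializer {

  // Resolve the Spark-version-specific serializer lazily, on first use.
  private lazy val delegate: CachedBatchSerializer =
    Class.forName("com.example.shims.VersionSpecificCachedBatchSerializer") // hypothetical
      .getDeclaredConstructor()
      .newInstance()
      .asInstanceOf[CachedBatchSerializer]

  override def supportsColumnarInput(schema: Seq[Attribute]): Boolean =
    delegate.supportsColumnarInput(schema)

  override def supportsColumnarOutput(schema: StructType): Boolean =
    delegate.supportsColumnarOutput(schema)

  override def convertColumnarBatchToCachedBatch(
      input: RDD[ColumnarBatch],
      schema: Seq[Attribute],
      storageLevel: StorageLevel,
      conf: SQLConf): RDD[CachedBatch] =
    delegate.convertColumnarBatchToCachedBatch(input, schema, storageLevel, conf)

  override def convertInternalRowToCachedBatch(
      input: RDD[InternalRow],
      schema: Seq[Attribute],
      storageLevel: StorageLevel,
      conf: SQLConf): RDD[CachedBatch] =
    delegate.convertInternalRowToCachedBatch(input, schema, storageLevel, conf)

  override def convertCachedBatchToColumnarBatch(
      input: RDD[CachedBatch],
      cacheAttributes: Seq[Attribute],
      selectedAttributes: Seq[Attribute],
      conf: SQLConf): RDD[ColumnarBatch] =
    delegate.convertCachedBatchToColumnarBatch(input, cacheAttributes, selectedAttributes, conf)

  override def convertCachedBatchToInternalRow(
      input: RDD[CachedBatch],
      cacheAttributes: Seq[Attribute],
      selectedAttributes: Seq[Attribute],
      conf: SQLConf): RDD[InternalRow] =
    delegate.convertCachedBatchToInternalRow(input, cacheAttributes, selectedAttributes, conf)

  override def buildFilter(
      predicates: Seq[Expression],
      cachedAttributes: Seq[Attribute]): (Int, Iterator[CachedBatch]) => Iterator[CachedBatch] =
    delegate.buildFilter(predicates, cachedAttributes)
}
```

The practical benefit, visible in the docs change below, is that the class name users configure no longer encodes a Spark version.
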
1 change: 1 addition & 0 deletions dist/README.md
@@ -27,3 +27,4 @@ If you have to change the contents of the uber jar the following files control w

1. `unshimmed-base.txt` - this has classes and files that should go into the base jar with their normal package name (not shaded). This includes user-visible classes (i.e. com/nvidia/spark/SQLPlugin), Python files, and other files that aren't version specific. The jar built for Spark 3.0.1 is used for these base classes.
2. `unshimmed-extras.txt` - This is applied to all of the individual Spark-specific version jars to pull out any files that need to go into the base of the jar rather than into a Spark-specific directory.
3. `unshimmed-spark311.txt` - This is applied to the Spark 3.1.1 specific version jar to pull out any files that need to go into the base of the jar rather than into the Spark-specific directory (a quick way to verify where these classes land is sketched below).
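
One way to check where these include lists actually placed things is to list the dist jar's entries: classes matched by the unshimmed files should appear at the jar root (e.g. `com/nvidia/spark/...`), while everything else sits under a per-version directory such as `spark311/`. A minimal sketch, assuming a locally built dist jar; the path is illustrative:

```scala
import java.util.jar.JarFile
import scala.collection.JavaConverters._

// Hypothetical path: point this at the dist jar produced by the build.
val jarPath = "dist/target/rapids-4-spark_2.12-SNAPSHOT.jar"
val jar = new JarFile(jarPath)
val entries = jar.entries().asScala.map(_.getName).toList
jar.close()

// Entries for the serializer should appear at the jar root, not only under spark311/.
entries.filter(_.contains("ParquetCachedBatchSerializer")).foreach(println)
```
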
51 changes: 41 additions & 10 deletions dist/pom.xml
@@ -245,13 +245,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -516,13 +522,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -728,13 +740,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
dest="${project.build.directory}/parallel-world/spark312"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>

<unzip
@@ -1014,13 +1032,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1342,13 +1366,19 @@
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/"
>
<patternset refid="includeMeta"/>
<patternset id="includes-spark311">
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark311.jar"
dest="${project.build.directory}/parallel-world/spark311"
>
<patternset refid="excludeMeta"/>
<patternset id="excludes-spark311">
<excludesfile name="${project.basedir}/unshimmed-extras.txt"/>
<excludesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
src="${project.build.directory}/deps/rapids-4-spark-aggregator_${scala.binary.version}-${project.version}-spark312.jar"
@@ -1622,6 +1652,7 @@
<patternset id="sharedWorld">
<includesfile name="${project.basedir}/unshimmed-base.txt"/>
<includesfile name="${project.basedir}/unshimmed-extras.txt"/>
<includesfile name="${project.basedir}/unshimmed-spark311.txt"/>
</patternset>
</unzip>
<unzip
2 changes: 2 additions & 0 deletions dist/unshimmed-spark311.txt
@@ -0,0 +1,2 @@
com/nvidia/spark/ParquetCachedBatchSerializer*
com/nvidia/spark/GpuCachedBatchSerializer*
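
These two globs exist because users reference the serializer purely by class name in `spark.sql.cache.serializer`, before any Spark-version shim has been selected, so the classes must resolve from the base of the jar. A minimal spark-shell check, assuming the plugin jar is already on the driver classpath:

```scala
// Should succeed without involving any spark3xx-specific directory on the classpath.
val loaded =
  try { Class.forName("com.nvidia.spark.ParquetCachedBatchSerializer"); true }
  catch { case _: ClassNotFoundException => false }
println(s"ParquetCachedBatchSerializer resolvable: $loaded")
```
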
2 changes: 1 addition & 1 deletion docs/additional-functionality/cache-serializer.md
@@ -37,7 +37,7 @@ nav_order: 2

To use this serializer, please run Spark with the following conf.
```
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer
spark-shell --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer
```
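
The same setting can also be applied when building the session programmatically. A minimal sketch follows; the app name and workload are illustrative, and since `spark.sql.cache.serializer` is a static conf it must be set before the session is created:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative: set the cache serializer at session creation time.
val spark = SparkSession.builder()
  .appName("pcbs-example")
  .config("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer")
  .getOrCreate()

// Subsequent .cache()/.persist() calls on DataFrames use the Parquet-based cached batches.
spark.range(0, 1000).toDF("id").cache().count()
```
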


20 changes: 9 additions & 11 deletions jenkins/databricks/test.sh
@@ -59,17 +59,16 @@ IS_SPARK_311_OR_LATER=0
[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VER" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1

TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [ -d "$LOCAL_JAR_PATH" ]; then
## Run tests with jars in the LOCAL_JAR_PATH dir downloaded from the dependency repo
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
@@ -80,12 +79,11 @@ else
## Run tests with jars building from the spark-rapids source code
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
## Run cache tests
#if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
# PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
# bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
#fi
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
7 changes: 2 additions & 5 deletions jenkins/spark-tests.sh
@@ -69,9 +69,6 @@ IS_SPARK_311_OR_LATER=0
export SPARK_TASK_MAXFAILURES=1
[[ "$IS_SPARK_311_OR_LATER" -eq "0" ]] && SPARK_TASK_MAXFAILURES=4

IS_SPARK_311=0
[[ "$SPARK_VER" == "3.1.1" ]] && IS_SPARK_311=1

export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"

#stop and restart SPARK ETL
@@ -138,7 +135,7 @@ run_test() {

cache_serializer)
SPARK_SUBMIT_FLAGS="$BASE_SPARK_SUBMIT_ARGS $SEQ_CONF \
--conf spark.sql.cache.serializer=com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer" \
--conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer" \
./run_pyspark_from_build.sh -k cache_test
;;

@@ -179,7 +176,7 @@ fi
run_test cudf_udf_test

# Temporarily only run on Spark 3.1.1 (https://github.com/NVIDIA/spark-rapids/issues/3311)
if [[ "$IS_SPARK_311" -eq "1" ]]; then
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
run_test cache_serializer
fi
