From 0509509f17813aa9ca8ae6d7e7bd0fac26651e7c Mon Sep 17 00:00:00 2001
From: NvTimLiu <50287591+NvTimLiu@users.noreply.github.com>
Date: Thu, 15 Apr 2021 21:40:27 +0800
Subject: [PATCH] Add dynamic Spark configuration for Databricks (#2116)

* Add dynamic Spark confs for Databricks

We need a way to set Spark configurations dynamically for Databricks,
e.g., when we test cuDF sonatype release jars we need to disable the
cudf-rapids version match by adding
"--conf spark.rapids.cudfVersionOverride=true", or enable/disable AQE,
or anything else.

By adding the parameter spark_conf="--conf spark.xxx.xxx=xxx --conf ......"
to the script 'run-tests.py', we can dynamically pass whatever confs we
need to the Databricks cluster.

Signed-off-by: Tim Liu

* Comma-separated list of Spark configurations

Signed-off-by: Tim Liu

* Add a comment to make the '-f' format clear

* Add a comment to make the '-f' format clear

* Fix typo

* Add '--conf' if the SPARK_CONF is not empty
---
 jenkins/databricks/params.py    | 13 +++++++++----
 jenkins/databricks/run-tests.py |  2 +-
 jenkins/databricks/test.sh      | 21 ++++++++++++++++++---
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/jenkins/databricks/params.py b/jenkins/databricks/params.py
index e96acded2b4..ff815735a69 100644
--- a/jenkins/databricks/params.py
+++ b/jenkins/databricks/params.py
@@ -26,19 +26,21 @@
 clusterid = ''
 build_profiles = 'databricks,!snapshot-shims'
 jar_path = ''
+# `spark_conf` can take a comma-separated list of Spark configurations, e.g., 'spark.foo=1,spark.bar=2,...'
+spark_conf = ''
 
 try:
-    opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:',
-                               ['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=', 'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles=', 'jarpath'])
+    opts, args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:',
+                               ['workspace=', 'token=', 'clusterid=', 'private=', 'localscript=', 'dest=', 'sparktgz=', 'basesparkpomversion=', 'buildprofiles=', 'jarpath', 'sparkconf'])
 except getopt.GetoptError:
     print(
-        'run-tests.py -s -t -c -p -l -d -z -v -b -j ')
+        'run-tests.py -s -t -c -p -l -d -z -v -b -j -f ')
     sys.exit(2)
 
 for opt, arg in opts:
     if opt == '-h':
         print(
-            'run-tests.py -s -t -c -p -n -l -d , -z -v -b ')
+            'run-tests.py -s -t -c -p -n -l -d , -z -v -b -f ')
         sys.exit()
     elif opt in ('-w', '--workspace'):
         workspace = arg
@@ -60,6 +62,8 @@
         build_profiles = arg
     elif opt in ('-j', '--jarpath'):
         jar_path = arg
+    elif opt in ('-f', '--sparkconf'):
+        spark_conf = arg
 
 print('-w is ' + workspace)
 print('-c is ' + clusterid)
@@ -69,3 +73,4 @@
 print('-z is ' + source_tgz)
 print('-v is ' + base_spark_pom_version)
 print('-j is ' + jar_path)
+print('-f is ' + spark_conf)
diff --git a/jenkins/databricks/run-tests.py b/jenkins/databricks/run-tests.py
index 3e33d7215ab..79e7e629aea 100644
--- a/jenkins/databricks/run-tests.py
+++ b/jenkins/databricks/run-tests.py
@@ -35,7 +35,7 @@ def main():
     print("rsync command: %s" % rsync_command)
     subprocess.check_call(rsync_command, shell = True)
 
-    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.jar_path)
+    ssh_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi" % (master_addr, params.private_key_file, params.script_dest, params.jar_path, params.spark_conf)
     print("ssh command: %s" % ssh_command)
     subprocess.check_call(ssh_command, shell = True)
 
diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh
index 8df0104d9da..1e7e1afb96f 100755
--- a/jenkins/databricks/test.sh
+++ b/jenkins/databricks/test.sh
@@ -15,9 +15,10 @@
 # limitations under the License.
 #
 
-set -e
+set -ex
 
 LOCAL_JAR_PATH=$1
+SPARK_CONF=$2
 
 # tests
 export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
@@ -38,6 +39,20 @@ CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
                     --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
                     --conf spark.rapids.python.concurrentPythonWorkers=2"
 
+## Convert 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1; export PYSP_TEST_spark_bar=2; ...'
+if [ -n "$SPARK_CONF" ]; then
+    CONF_LIST=${SPARK_CONF//','/' '}
+    for CONF in ${CONF_LIST}; do
+        KEY=${CONF%%=*}
+        VALUE=${CONF#*=}
+        ## run_pyspark_from_build.sh requires the Spark configs in the form 'export PYSP_TEST_spark_foo=1'
+        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
+    done
+
+    ## Convert 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
+    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"
+fi
+
 TEST_TYPE="nightly"
 if [ -d "$LOCAL_JAR_PATH" ]; then
     ## Run tests with jars in the LOCAL_JAR_PATH dir downloading from the denpedency repo
@@ -45,7 +60,7 @@ if [ -d "$LOCAL_JAR_PATH" ]; then
 
     ## Run cudf-udf tests
    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \
+    LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
         bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
 else
     ## Run tests with jars building from the spark-rapids source code
@@ -53,6 +68,6 @@ else
 
     ## Run cudf-udf tests
    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    SPARK_SUBMIT_FLAGS=$CUDF_UDF_TEST_ARGS TEST_PARALLEL=1 \
+    SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
        bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
 fi
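
For reference, a hypothetical invocation of run-tests.py with the new -f/--sparkconf option. The option letters come from the patch above; the workspace, token, cluster id, key file, tgz, version, and jar path values below are placeholders, not taken from this patch, and the two Spark configs echo the examples in the commit message:

    # Pass a comma-separated list of Spark configs to the Databricks test run;
    # run-tests.py forwards the list over ssh and test.sh receives it as its
    # second positional argument ($2).
    python jenkins/databricks/run-tests.py \
        -w "$WORKSPACE" -t "$TOKEN" -c "$CLUSTER_ID" \
        -p ~/.ssh/id_rsa -l jenkins/databricks/test.sh -d /home/ubuntu/test.sh \
        -z spark-3.0.1-bin-hadoop3.2.tgz -v 3.0.1 -j "$JAR_PATH" \
        -f 'spark.rapids.cudfVersionOverride=true,spark.sql.adaptive.enabled=false'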
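
A minimal, standalone sketch of the conversion the new test.sh block performs (the two config entries are made-up examples, and the echo at the end is only for illustration):

    SPARK_CONF='spark.rapids.cudfVersionOverride=true,spark.sql.adaptive.enabled=false'

    # Each comma-separated entry becomes an exported PYSP_TEST_* variable
    # (dots replaced with underscores), which run_pyspark_from_build.sh reads.
    for CONF in ${SPARK_CONF//','/' '}; do
        KEY=${CONF%%=*}     # part before the first '='
        VALUE=${CONF#*=}    # part after the first '='
        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
    done

    # The same list rewritten as spark-submit flags:
    #   --conf spark.rapids.cudfVersionOverride=true --conf spark.sql.adaptive.enabled=false
    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"
    echo "$SPARK_CONF"

Note that this simple comma split assumes the configuration values themselves contain no commas or spaces.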