Databricks Build Support #221

Merged · 8 commits · Jun 19, 2020
Changes from all commits
2 changes: 1 addition & 1 deletion jenkins/Dockerfile.ubuntu16
@@ -35,5 +35,5 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \
    openjdk-8-jdk python3.6 python3-pip tzdata git

RUN ln -s /usr/bin/python3.6 /usr/bin/python
-RUN python -m pip install pytest sre_yield
+RUN python -m pip install pytest sre_yield requests

117 changes: 117 additions & 0 deletions jenkins/Jenkinsfile.databricksnightly
@@ -0,0 +1,117 @@
#!/usr/bin/env groovy
/*
 * Copyright (c) 2020, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Jenkinsfile for building rapids-plugin on Databricks
 */

pipeline {
    agent { label 'vanilla' }

    options {
        ansiColor('xterm')
        timeout(time: 120, unit: 'MINUTES')
        buildDiscarder(logRotator(numToKeepStr: '10'))
    }

    parameters {
        choice(name: 'DEPLOY_TO', choices: ['Urm', 'Local'],
            description: 'Where to deploy artifacts to')
        string(name: 'REF', defaultValue: 'branch-0.1', description: 'Commit to build')
    }

    environment {
        JENKINS_ROOT = 'jenkins'
        MVN_URM_MIRROR='-s jenkins/settings.xml -P mirror-apache-to-urm'
        LIBCUDF_KERNEL_CACHE_PATH='/tmp'
        URM_CREDS = credentials("svcngcc_artifactory")
        DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
    }

    triggers {
        cron('H 5 * * *')
    }

    stages {
        stage('Ubuntu16 CUDA10.1') {
            agent {
                dockerfile {
                    label 'docker-gpu'
                    filename 'Dockerfile.ubuntu16'
                    dir "jenkins"
                    args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
                        -v ${HOME}/.zinc:${HOME}/.zinc:rw \
                        -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
                }
            }
            steps {
                script {
                    sshagent(credentials : ['svcngcc_pubpriv']) {
                        sh "mvn versions:set -DnewVersion=0.1-databricks-SNAPSHOT && git clean -d -f"
Contributor:
Should this be a Jenkins variable, or somehow computed from the version set in the pom? Otherwise we need to manually remember to update it when we move versions.

Collaborator (Author):
Yep, I agree; that is covered in the followup issue to make things more parameterized. I can change it here if that makes releasing easier.

Contributor:
I'm OK as long as whoever is doing the release knows to update these three places. Assuming we're doing the release as a PR against master, I think we can manually check it for this first release before merging to master and fix it in the followup #224.
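
For reference, one way this could be computed from the pom instead of hardcoded (a sketch only, assuming maven-help-plugin 3.1.0+ for -DforceStdout; not what this PR does):

```bash
# Sketch only: read the project version from the pom, then derive the
# Databricks snapshot version from it instead of hardcoding 0.1.
BASE_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
mvn versions:set -DnewVersion="${BASE_VERSION%-SNAPSHOT}-databricks-SNAPSHOT"
```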

sh "patch -p1 < ./jenkins/databricks/dbimports.patch"
sh "tar -zcvf spark-rapids-ci.tgz * || true"
sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh"
sh "./jenkins/databricks/deploy.sh"
}
}
}
}
stage('cleanup') {
agent {
dockerfile {
label 'docker-gpu'
filename 'Dockerfile.ubuntu16'
dir "jenkins"
args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-v ${HOME}/.zinc:${HOME}/.zinc:rw \
-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group'
}
}
steps {
script {
sh "python3.6 ./jenkins/databricks/shutdown.py -t $DATABRICKS_TOKEN"
}
}
}
} // end of stages
post {
always {
script {
if (currentBuild.currentResult == "SUCCESS") {
slack("#rapidsai-spark-cicd", "Success", color: "#33CC33")
} else {
slack("#rapidsai-spark-cicd", "Failed", color: "#FF0000")
}
}
}
}
} // end of pipeline

void slack(Map params = [:], String channel, String message) {
    Map defaultParams = [
        color: "#000000",
        baseUrl: "https://nvidia.slack.com/services/hooks/jenkins-ci/",
        tokenCredentialId: "slack_token"
    ]

    params["channel"] = channel
    params["message"] = "${BUILD_URL}\n" + message

    slackSend(defaultParams << params)
}
98 changes: 98 additions & 0 deletions jenkins/databricks/build.sh
@@ -0,0 +1,98 @@
#!/bin/bash
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

SPARKTGZ=/home/ubuntu/spark-rapids-ci.tgz
if [ "$1" != "" ]; then
SPARKTGZ=$1
fi

sudo apt install -y maven
rm -rf spark-rapids
mkdir spark-rapids
tar -zxvf $SPARKTGZ -C spark-rapids
cd spark-rapids
# Pull the Spark 3.0.0 artifacts (ignoring errors), then install the Databricks jars and build again.
mvn clean package || true
M2DIR=/home/ubuntu/.m2/repository
JARDIR=/databricks/jars
SQLJAR=----workspace_spark_3_0--sql--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
CATALYSTJAR=----workspace_spark_3_0--sql--catalyst--catalyst-hive-2.3__hadoop-2.7_2.12_deploy.jar
ANNOTJAR=----workspace_spark_3_0--common--tags--tags-hive-2.3__hadoop-2.7_2.12_deploy.jar
COREJAR=----workspace_spark_3_0--core--core-hive-2.3__hadoop-2.7_2.12_deploy.jar
VERSIONJAR=----workspace_spark_3_0--core--libcore_generated_resources.jar
VERSION=3.0.0
mvn install:install-file \
    -Dmaven.repo.local=$M2DIR \
    -Dfile=$JARDIR/$COREJAR \
    -DgroupId=org.apache.spark \
    -DartifactId=spark-core_2.12 \
    -Dversion=$VERSION \
    -Dpackaging=jar

mvn install:install-file \
    -Dmaven.repo.local=$M2DIR \
    -Dfile=$JARDIR/$CATALYSTJAR \
    -DgroupId=org.apache.spark \
    -DartifactId=spark-catalyst_2.12 \
    -Dversion=$VERSION \
    -Dpackaging=jar

mvn install:install-file \
    -Dmaven.repo.local=$M2DIR \
    -Dfile=$JARDIR/$SQLJAR \
    -DgroupId=org.apache.spark \
    -DartifactId=spark-sql_2.12 \
    -Dversion=$VERSION \
    -Dpackaging=jar

mvn install:install-file \
    -Dmaven.repo.local=$M2DIR \
    -Dfile=$JARDIR/$ANNOTJAR \
    -DgroupId=org.apache.spark \
    -DartifactId=spark-annotation_2.12 \
    -Dversion=$VERSION \
    -Dpackaging=jar

mvn install:install-file \
    -Dmaven.repo.local=$M2DIR \
    -Dfile=$JARDIR/$VERSIONJAR \
    -DgroupId=org.apache.spark \
    -DartifactId=spark-version_2.12 \
    -Dversion=$VERSION \
    -Dpackaging=jar
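
For reference, the five install-file invocations above differ only in the jar and artifactId; a table-driven sketch (not part of this PR, assumes bash 4 for associative arrays) could cut the repetition:

```bash
# Sketch only: map each Spark artifactId to the Databricks jar installed above.
declare -A DB_JARS=(
    [spark-core_2.12]=$COREJAR
    [spark-catalyst_2.12]=$CATALYSTJAR
    [spark-sql_2.12]=$SQLJAR
    [spark-annotation_2.12]=$ANNOTJAR
    [spark-version_2.12]=$VERSIONJAR
)
for artifact in "${!DB_JARS[@]}"; do
    mvn install:install-file \
        -Dmaven.repo.local=$M2DIR \
        -Dfile=$JARDIR/${DB_JARS[$artifact]} \
        -DgroupId=org.apache.spark \
        -DartifactId=$artifact \
        -Dversion=$VERSION \
        -Dpackaging=jar
done
```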

mvn -Pdatabricks clean verify -DskipTests

# copy so we pick up new built jar
sudo cp dist/target/rapids-4-spark_2.12-*-SNAPSHOT.jar /databricks/jars/rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar
Contributor:
Same hardcoded version issue here.

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
sudo /databricks/conda/envs/databricks-ml-gpu/bin/pip install pytest sre_yield
cd /home/ubuntu/spark-rapids/integration_tests
export SPARK_HOME=/databricks/spark
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
$SPARK_HOME/bin/spark-submit ./runtests.py 2>&1 | tee out

cd /home/ubuntu
tar -zcvf spark-rapids-built.tgz spark-rapids
78 changes: 78 additions & 0 deletions jenkins/databricks/dbimports.patch
@@ -0,0 +1,78 @@
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
index b1599e6..b432fe0 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
@@ -18,8 +18,9 @@ package com.nvidia.spark.rapids
import ai.rapids.cudf.{NvtxColor, Table}

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
import org.apache.spark.sql.catalyst.plans.{Inner, JoinType, LeftAnti, LeftOuter, LeftSemi}
-import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, HashJoin}
+import org.apache.spark.sql.execution.joins.HashJoin
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
index 6de4a95..90fb3e7 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
@@ -21,10 +21,11 @@ import com.nvidia.spark.rapids.GpuMetricNames._
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
-import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, ShuffledHashJoinExec}
+import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.vectorized.ColumnarBatch

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
index c493508..3e24a2d 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
@@ -17,8 +17,9 @@
package com.nvidia.spark.rapids

import org.apache.spark.internal.Logging
+import org.apache.spark.sql.catalyst.optimizer.BuildRight
import org.apache.spark.sql.execution.SortExec
-import org.apache.spark.sql.execution.joins.{BuildRight, SortMergeJoinExec}
+import org.apache.spark.sql.execution.joins.SortMergeJoinExec

class GpuSortMergeJoinMeta(
    join: SortMergeJoinExec,
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
index 088b66b..b7f267f 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
@@ -22,12 +22,13 @@ import com.nvidia.spark.rapids.GpuOverrides.isStringLit

import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ComplexTypeMergingExpression, Expression, String2TrimExpression, TernaryExpression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.connector.read.Scan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
-import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildLeft, BuildRight, ShuffledHashJoinExec, SortMergeJoinExec}
+import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ShuffledHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.types.{CalendarIntervalType, DataType, DataTypes, StringType}

trait ConfKeysAndIncompat {
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
index dc12bbe..a07add3 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
@@ -21,6 +21,7 @@ import com.nvidia.spark.rapids.GpuMetricNames._

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.physical.{BroadcastDistribution, Distribution, UnspecifiedDistribution}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
29 changes: 29 additions & 0 deletions jenkins/databricks/deploy.sh
@@ -0,0 +1,29 @@
#!/bin/bash
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e
rm -rf deploy
mkdir -p deploy
cd deploy
tar -zxvf ../spark-rapids-built.tgz
cd spark-rapids
echo "Maven mirror is $MVN_URM_MIRROR"
SERVER_ID='snapshots'
SERVER_URL='https://urm.nvidia.com:443/artifactory/sw-spark-maven-local'
FPATH=./dist/target/rapids-4-spark_2.12-0.1-databricks-SNAPSHOT.jar
Contributor:
Same hardcoded version issue here.
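
For reference, FPATH could be resolved from what the build actually produced instead of hardcoding the version (a sketch, assuming exactly one matching jar under dist/target):

```bash
# Sketch only: pick up whichever databricks snapshot jar the build produced.
FPATH=$(ls ./dist/target/rapids-4-spark_2.12-*-databricks-SNAPSHOT.jar)
```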

mvn -B deploy:deploy-file $MVN_URM_MIRROR -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
    -Dfile=$FPATH -DpomFile=dist/pom.xml