From eac020af34d2be85d4d84f2f8b02ba168d3658d3 Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Tue, 28 Jul 2020 14:38:55 -0500
Subject: [PATCH 1/2] remove unneeded files and use rm -rf

---
 jenkins/Jenkinsfile.databricksnightly |   2 +-
 jenkins/Jenkinsfile.databricksrelease | 110 ------------------------
 jenkins/databricks/dbimports.patch    | 118 --------------------------
 3 files changed, 1 insertion(+), 229 deletions(-)
 delete mode 100644 jenkins/Jenkinsfile.databricksrelease
 delete mode 100644 jenkins/databricks/dbimports.patch

diff --git a/jenkins/Jenkinsfile.databricksnightly b/jenkins/Jenkinsfile.databricksnightly
index 6bc6a8bec38..47eef0692ea 100644
--- a/jenkins/Jenkinsfile.databricksnightly
+++ b/jenkins/Jenkinsfile.databricksnightly
@@ -76,7 +76,7 @@ pipeline {
       steps {
         script {
           sshagent(credentials : ['svcngcc_pubpriv']) {
-            sh "rm spark-rapids-ci.tgz"
+            sh "rm -rf spark-rapids-ci.tgz"
             sh "tar -zcvf spark-rapids-ci.tgz *"
             sh "python3.6 ./jenkins/databricks/run-tests.py -c $CLUSTER_ID -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
             sh "./jenkins/databricks/deploy.sh"
diff --git a/jenkins/Jenkinsfile.databricksrelease b/jenkins/Jenkinsfile.databricksrelease
deleted file mode 100644
index 647bab74099..00000000000
--- a/jenkins/Jenkinsfile.databricksrelease
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/local/env groovy
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
-*
-* Jenkinsfile for building and deploy rapids-plugin for Databricks to public repo
-*
-*/
-@Library('shared-libs') _
-
-def urmUrl="https://${ArtifactoryConstants.ARTIFACTORY_NAME}/artifactory/sw-spark-maven"
-
-pipeline {
-  agent {
-    dockerfile {
-      label 'docker-deploy||docker-gpu'
-      filename 'Dockerfile.ubuntu16'
-      dir "jenkins"
-      args '--runtime=nvidia -v ${HOME}/.m2:${HOME}/.m2:rw \
-        -v ${HOME}/.zinc:${HOME}/.zinc:rw'
-    }
-  }
-
-  options {
-    ansiColor('xterm')
-    timeout(time: 120, unit: 'MINUTES')
-    buildDiscarder(logRotator(numToKeepStr: '10'))
-  }
-
-  parameters {
-    string(name: 'DEPLOY_TO', defaultValue: 'https://oss.sonatype.org/service/local/staging/deploy/maven2',
-        description: 'The repo URL where to deploy the artifacts')
-    string(name: 'DATABRICKS_VERSION',
-        defaultValue: '0.2.0-SNAPSHOT', description: 'Version to set')
-    string(name: 'CUDF_VERSION',
-        defaultValue: '0.15-SNAPSHOT', description: 'Cudf version to use')
-    string(name: 'CUDA_VERSION',
-        defaultValue: 'cuda10-1', description: 'cuda version to use')
-    string(name: 'REF', defaultValue: 'branch-0.2', description: 'Commit to build')
-  }
-
-  environment {
-    JENKINS_ROOT='jenkins'
-    LIBCUDF_KERNEL_CACHE_PATH='/tmp/.cudf'
-    MVN_MIRROR='-s jenkins/settings.xml -P mirror-apache-to-urm'
-    URM_CREDS = credentials("svcngcc_artifactory")
-    DATABRICKS_TOKEN = credentials("SPARK_DATABRICKS_TOKEN")
-    DIST_PL='dist'
-    SQL_PL='sql-plugin'
-    SCALA_VERSION = '2.12'
-    SPARK_VERSION = '3.0.0-databricks'
-    CI_RAPIDS_JAR = 'rapids-4-spark_2.12-0.1-SNAPSHOT-ci.jar'
-    CI_CUDF_JAR = 'cudf-0.14-cuda10-1.jar'
-    LOCAL_URL = "${localUrl}"
-  }
-
-  stages {
-    stage('Build') {
-      steps {
-        script {
-          sshagent(credentials : ['svcngcc_pubpriv']) {
-            sh "rm spark-rapids-ci.tgz"
-            sh "tar -zcvf spark-rapids-ci.tgz * || true"
-            sh "python3.6 ./jenkins/databricks/run-tests.py -z ./spark-rapids-ci.tgz -t $DATABRICKS_TOKEN -p /home/svcngcc/.ssh/id_rsa -l ./jenkins/databricks/build.sh -j $CI_RAPIDS_JAR -b $DATABRICKS_VERSION -k $SPARK_VERSION -a $SCALA_VERSION -f $CUDF_VERSION -u $CUDA_VERSION -m $CI_CUDF_JAR"
-          }
-        }
-      }
-    }
-    stage("Deploy") {
-      environment {
-        SERVER_ID='ossrh'
-        SERVER_URL="${DEPLOY_TO}"
-        GPG_PASSPHRASE=credentials('SPARK_RAPIDS_GPG_PASSPHRASE')
-        GPG_FILE=credentials('SPARK_RAPIDS_GPG_PRIVATE_KEY')
-        SONATYPE=credentials('SPARK_SONATYPE_USERPASS')
-        GNUPGHOME="${WORKSPACE}/.gnupg"
-      }
-      steps {
-        script {
-          sh 'rm -rf $GNUPGHOME'
-          sh 'gpg --import $GPG_FILE'
-          retry (3) {
-            sh "bash $JENKINS_ROOT/deploy.sh true true"
-          }
-        }
-      }
-    }
-    stage('Cleanup') {
-      steps {
-        script {
-          sh "python3.6 ./jenkins/databricks/shutdown.py -t $DATABRICKS_TOKEN"
-        }
-      }
-    }
-  } // End of stages
-} // end of pipeline
diff --git a/jenkins/databricks/dbimports.patch b/jenkins/databricks/dbimports.patch
deleted file mode 100644
index db44ecf0e35..00000000000
--- a/jenkins/databricks/dbimports.patch
+++ /dev/null
@@ -1,118 +0,0 @@
-diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-index f0aaec3..eafba2a 100644
---- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuHashJoin.scala
-@@ -19,8 +19,9 @@ import ai.rapids.cudf.{NvtxColor, Table}
- 
- import org.apache.spark.TaskContext
- import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
- import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, FullOuter, InnerLike, JoinType, LeftAnti, LeftExistence, LeftOuter, LeftSemi, RightOuter}
--import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, HashJoin}
-+import org.apache.spark.sql.execution.joins.HashJoin
- import org.apache.spark.sql.execution.metric.SQLMetric
- import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
- 
-diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
-index 7ae310b..3ebde77 100644
---- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
-+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
-@@ -22,10 +22,11 @@ import org.apache.spark.TaskContext
- import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.catalyst.InternalRow
- import org.apache.spark.sql.catalyst.expressions.Expression
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
- import org.apache.spark.sql.catalyst.plans.JoinType
- import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution}
- import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
--import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, ShuffledHashJoinExec}
-+import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec
- import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
- import org.apache.spark.sql.vectorized.ColumnarBatch
- 
-diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
-index af7e607..6edf950 100644
---- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
-+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSortMergeJoinExec.scala
-@@ -17,9 +17,10 @@
- 
- package com.nvidia.spark.rapids
- 
- import org.apache.spark.internal.Logging
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
- import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, FullOuter, InnerLike, JoinType, LeftAnti, LeftOuter, LeftSemi, RightOuter}
- import org.apache.spark.sql.execution.SortExec
--import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, SortMergeJoinExec}
-+import org.apache.spark.sql.execution.joins.SortMergeJoinExec
- 
- class GpuSortMergeJoinMeta(
-     join: SortMergeJoinExec,
-diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
-index 834ec51..646ccda 100644
---- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
-+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
-@@ -22,12 +22,13 @@ import com.nvidia.spark.rapids.GpuOverrides.isStringLit
- 
- import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ComplexTypeMergingExpression, Expression, String2TrimExpression, TernaryExpression, UnaryExpression}
- import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
- import org.apache.spark.sql.catalyst.plans.physical.Partitioning
- import org.apache.spark.sql.connector.read.Scan
- import org.apache.spark.sql.execution.SparkPlan
- import org.apache.spark.sql.execution.command.DataWritingCommand
- import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
--import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildLeft, BuildRight, ShuffledHashJoinExec, SortMergeJoinExec}
-+import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ShuffledHashJoinExec, SortMergeJoinExec}
- import org.apache.spark.sql.types.{CalendarIntervalType, DataType, DataTypes, StringType}
- 
- trait ConfKeysAndIncompat {
-diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuCartesianProductExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuCartesianProductExec.scala
-index 4c8c540..fb6dc06 100644
---- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuCartesianProductExec.scala
-+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuCartesianProductExec.scala
-@@ -27,8 +27,8 @@ import org.apache.spark.rdd.RDD
- import org.apache.spark.serializer.Serializer
- import org.apache.spark.sql.catalyst.InternalRow
- import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
-+import org.apache.spark.sql.catalyst.optimizer.BuildLeft
- import org.apache.spark.sql.execution.{BinaryExecNode, ExplainUtils, SparkPlan}
--import org.apache.spark.sql.execution.joins.BuildLeft
- import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
- import org.apache.spark.sql.rapids.execution.GpuBroadcastNestedLoopJoinExec
- import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
-diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
-index ac444d1..14a8c6e 100644
---- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
-+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastHashJoinExec.scala
-@@ -22,6 +22,7 @@ import com.nvidia.spark.rapids.GpuMetricNames._
- import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.catalyst.InternalRow
- import org.apache.spark.sql.catalyst.expressions.Expression
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
- import org.apache.spark.sql.catalyst.plans.JoinType
- import org.apache.spark.sql.catalyst.plans.physical.{BroadcastDistribution, Distribution, UnspecifiedDistribution}
- import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
-diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala
-index c120444..16c318a 100644
---- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala
-+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala
-@@ -23,11 +23,12 @@ import com.nvidia.spark.rapids.GpuMetricNames.{NUM_OUTPUT_BATCHES, NUM_OUTPUT_RO
- import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.catalyst.InternalRow
- import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
-+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
- import org.apache.spark.sql.catalyst.plans.{Cross, ExistenceJoin, FullOuter, Inner, InnerLike, JoinType, LeftExistence, LeftOuter, RightOuter}
- import org.apache.spark.sql.catalyst.plans.physical.{BroadcastDistribution, Distribution, IdentityBroadcastMode, UnspecifiedDistribution}
- import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
- import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
--import org.apache.spark.sql.execution.joins.{BroadcastNestedLoopJoinExec, BuildLeft, BuildRight, BuildSide}
-+import org.apache.spark.sql.execution.joins.{BroadcastNestedLoopJoinExec}
- import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
- import org.apache.spark.sql.vectorized.ColumnarBatch
- 
-@@ -222,4 +223,4 @@ case class GpuBroadcastNestedLoopJoinExec(
-       }
-     }
-   }
--}
-\ No newline at end of file
-+}

From 65c20be0a88e517efffa0833ec380478021d9801 Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Tue, 28 Jul 2020 14:48:24 -0500
Subject: [PATCH 2/2] Fix scalastyle for databricks shim

---
 .../spark/rapids/shims/spark300db/Spark300dbShims.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala b/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala
index fcf42aff1de..5895739e37a 100644
--- a/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala
+++ b/shims/spark300db/src/main/scala/com/nvidia/spark/rapids/shims/spark300db/Spark300dbShims.scala
@@ -70,7 +70,7 @@ class Spark300dbShims extends Spark300Shims {
 
   override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = {
     Seq(
-    GpuOverrides.exec[FileSourceScanExec](
+      GpuOverrides.exec[FileSourceScanExec](
         "Reading data from files, often from Hive tables",
         (fsse, conf, p, r) => new SparkPlanMeta[FileSourceScanExec](fsse, conf, p, r) {
           // partition filters and data filters are not run on the GPU
@@ -104,7 +104,7 @@ class Spark300dbShims extends Spark300Shims {
         (join, conf, p, r) => new GpuBroadcastHashJoinMeta(join, conf, p, r)),
      GpuOverrides.exec[ShuffledHashJoinExec](
        "Implementation of join using hashed shuffled data",
-        (join, conf, p, r) => new GpuShuffledHashJoinMeta(join, conf, p, r)),
+        (join, conf, p, r) => new GpuShuffledHashJoinMeta(join, conf, p, r))
    ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap
  }