From acfd8f3be1dc14659fc9b7c5061c0c8dee25010b Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 30 Jul 2020 13:48:45 -0500 Subject: [PATCH] Udf compiler pom followup (#475) * Minor changes to udf compiler pom/docs Signed-off-by: Alessandro Bellina * Update config description for the udfCompiler Signed-off-by: Alessandro Bellina --- docs/configs.md | 2 +- .../com/nvidia/spark/rapids/RapidsConf.scala | 3 +- udf-compiler/pom.xml | 100 +++++++++--------- 3 files changed, 50 insertions(+), 55 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 1b34abe368b..460c1173f47 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -49,7 +49,6 @@ Name | Description | Default Value spark.rapids.sql.concurrentGpuTasks|Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out of memory errors.|1 spark.rapids.sql.csvTimestamps.enabled|When set to true, enables the CSV parser to read timestamps. The default output format for Spark includes a timezone at the end. Anything except the UTC timezone is not supported. Timestamps after 2038 and before 1902 are also not supported.|false spark.rapids.sql.enabled|Enable (true) or disable (false) sql operations on the GPU|true -spark.rapids.sql.udfCompiler.enabled|When set to true, all UDFs are compiled to Catalyst expressions by Catalyst Analyzer|false spark.rapids.sql.explain|Explain why some parts of a query were not placed on a GPU or not. Possible values are ALL: print everything, NONE: print nothing, NOT_ON_GPU: print only parts of a query that did not go on the GPU|NONE spark.rapids.sql.format.csv.enabled|When set to false disables all csv input and output acceleration. 
(only input is currently supported anyways)|true spark.rapids.sql.format.csv.read.enabled|When set to false disables csv input acceleration|true @@ -68,6 +67,7 @@ Name | Description | Default Value spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647 spark.rapids.sql.replaceSortMergeJoin.enabled|Allow replacing sortMergeJoin with HashJoin|true spark.rapids.sql.shuffle.spillThreads|Number of threads used to spill shuffle data to disk in the background.|6 +spark.rapids.sql.udfCompiler.enabled|When set to true, Scala UDFs will be considered for compilation as Catalyst expressions|false spark.rapids.sql.variableFloatAgg.enabled|Spark assumes that all operations produce the exact same result each time. This is not true for some floating point aggregations, which can produce slightly different results on the GPU as the aggregation is done in parallel. 
This can enable those operations if you know the query is only computing it once.|false ## Supported GPU Operators and Fine Tuning diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 566b3f05987..ac4b35eb826 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -351,8 +351,7 @@ object RapidsConf { .createWithDefault(true) val UDF_COMPILER_ENABLED = conf("spark.rapids.sql.udfCompiler.enabled") - .doc("When set to true, all UDFs will be compiled to Catalyst expressions by Catalyst " + - "Analyzer.") + .doc("When set to true, Scala UDFs will be considered for compilation as Catalyst expressions") .booleanConf .createWithDefault(false) diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml index 425828327ae..bcbf7ed7ebd 100644 --- a/udf-compiler/pom.xml +++ b/udf-compiler/pom.xml @@ -25,31 +25,32 @@ 0.2.0-SNAPSHOT com.nvidia - rapids-4-spark-udf + rapids-4-spark-udf_2.12 + RAPIDS Accelerator for Apache Spark Scala UDF Plugin + The RAPIDS Scala UDF plugin for Apache Spark 0.2.0-SNAPSHOT + + ai.rapids + cudf + ${cuda.version} + org.scala-lang scala-library - commons-logging - commons-logging - 1.1.1 + org.apache.spark + spark-sql_${scala.binary.version} org.apache.spark spark-sql_${scala.binary.version} + test-jar + test ${spark.version} - - org.apache.spark - spark-sql_${scala.binary.version} - test-jar - test - ${spark.version} - org.apache.spark spark-catalyst_${scala.binary.version} @@ -67,54 +68,49 @@ ${project.version} provided - - ai.rapids - cudf - ${cuda.version} - - - - ${project.build.directory}/extra-resources - true - - - ${project.basedir}/.. - META-INF - - - LICENSE - - + + + ${project.build.directory}/extra-resources + true + + + ${project.basedir}/.. 
+ META-INF + + + LICENSE + + - maven-antrun-plugin - - - copy-notice - - run - - process-resources - - - - - - - - - - - - - + maven-antrun-plugin + + + copy-notice + + run + + process-resources + + + + + + + + + + + + +