Skip to content

Commit

Permalink
Use the ORC version that corresponds to the Spark version [databricks] (
Browse files Browse the repository at this point in the history
#4408)

* try to remove shade

* 312db build

* 301db build

* CDH shim

* Add license to pass RAT check

* Use Spark OrcFilters instead of the copied one; Comment a test case

* Remove protobuf and hive storage api version proterties

* Remove the dependency of Guava; Refactor

* Put mudules into specific profiles

* Fix thread factory builder bug

* Add dependency

* Update test case

* Fix shading overlapping resource warning

* Update the test cases

* Update version after 22.02 changed to 22.04 accordingly

Signed-off-by: Chong Gao <[email protected]>
  • Loading branch information
Chong Gao authored Feb 8, 2022
1 parent 434ee37 commit 215541a
Show file tree
Hide file tree
Showing 31 changed files with 732 additions and 622 deletions.
11 changes: 0 additions & 11 deletions NOTICE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,6 @@ Copyright 2014 and onwards The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

---------------------------------------------------------------------

Apache ORC
Copyright 2013-2019 The Apache Software Foundation

This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).

This product includes software developed by Hewlett-Packard:
(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P

---------------------------------------------------------------------
UCF Consortium - Unified Communication X (UCX)

Expand Down
49 changes: 2 additions & 47 deletions aggregator/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,53 +88,8 @@
</transformers>
<relocations>
<relocation>
<pattern>org.apache.orc.</pattern>
<shadedPattern>${rapids.shade.package}.orc.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
<excludes>
<!--
Class exclusions for Hive UDFs, to avoid the ClassNotFoundException,
For example:
E Caused by: java.lang.ClassNotFoundException: com.nvidia.shaded.spark.hadoop.hive.serde2.objectinspector.ObjectInspector
E at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
-->
<exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
<exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
</excludes>
</relocation>
<relocation>
<pattern>org.apache.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hive.</shadedPattern>
</relocation>
<relocation>
<pattern>io.airlift.compress.</pattern>
<shadedPattern>${rapids.shade.package}.io.airlift.compress.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.codec.</pattern>
<shadedPattern>${rapids.shade.package}.org.apache.commons.codec.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.lang.</pattern>
<shadedPattern>${rapids.shade.package}.org.apache.commons.lang.</shadedPattern>
</relocation>
<relocation>
<pattern>com.google</pattern>
<shadedPattern>${rapids.shade.package}.com.google</shadedPattern>
<pattern>com.google.flatbuffers</pattern>
<shadedPattern>${rapids.shade.package}.com.google.flatbuffers</shadedPattern>
</relocation>
</relocations>
<filters>
Expand Down
96 changes: 96 additions & 0 deletions common/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2022, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Filter unused classes for shade purpose, generate a jar with shaded classifier -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.04.0-SNAPSHOT</version>
</parent>

<artifactId>rapids-4-spark-common_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Common</name>
<description>Utility code that is common across the RAPIDS Accelerator projects</description>
<version>22.04.0-SNAPSHOT</version>

<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<resources>
<resource>
<!-- Include the properties file to provide the build information. -->
<directory>${project.build.directory}/extra-resources</directory>
<filtering>true</filtering>
</resource>
<resource>
<directory>${project.basedir}/..</directory>
<targetPath>META-INF</targetPath>
<includes>
<!-- The NOTICE will be taken care of by the antrun task below -->
<include>LICENSE</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>default-test-jar</id>
<phase>none</phase>
</execution>
</executions>
</plugin>
<!-- disable surefire as tests are some place else -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
23 changes: 23 additions & 0 deletions common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

object CheckUtils {
def checkArgument(expression: Boolean, msg: String): Unit = {
if (!expression) throw new IllegalArgumentException(msg)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import java.util.concurrent.{Executors, ThreadFactory}
import java.util.concurrent.atomic.AtomicLong

// This is similar to Guava ThreadFactoryBuilder
// Avoid to use Guava as it is a messy dependency in practice.
class ThreadFactoryBuilder {
private var nameFormat = Option.empty[String]
private var daemon = Option.empty[Boolean]

def setNameFormat(nameFormat: String): ThreadFactoryBuilder = {
nameFormat.format(0)
this.nameFormat = Some(nameFormat)
this
}

def setDaemon(daemon: Boolean): ThreadFactoryBuilder = {
this.daemon = Some(daemon)
this
}

def build(): ThreadFactory = {
val count = nameFormat.map(_ => new AtomicLong(0))
new ThreadFactory() {
private val defaultThreadFactory = Executors.defaultThreadFactory

override def newThread(r: Runnable): Thread = {
val thread = defaultThreadFactory.newThread(r)
nameFormat.foreach(f => thread.setName(f.format(count.get.getAndIncrement())))
daemon.foreach(b => thread.setDaemon(b))
thread
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import java.util.concurrent.{Callable, Executors}

import org.scalatest.FunSuite

class ThreadFactoryBuilderTest extends FunSuite {

test("test thread factory builder") {
val pool1 = Executors.newFixedThreadPool(2,
new ThreadFactoryBuilder().setNameFormat("thread-pool1-1 %s").setDaemon(true).build())
try {
var ret = pool1.submit(new Callable[String] {
override def call(): String = {
assert(Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "thread-pool1-1 0")
""
}
})
// waits and retrieves the result, if above asserts failed, will get execution exception
ret.get()
ret = pool1.submit(() => {
assert(Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "thread-pool1-1 1")
""
})
ret.get()
} finally {
pool1.shutdown()
}

val pool2 = Executors.newFixedThreadPool(2,
new ThreadFactoryBuilder().setNameFormat("pool2-%d").build())
try {
var ret = pool2.submit(new Callable[String] {
override def call(): String = {
assert(!Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "pool2-0")
""
}
})
ret.get()
ret = pool2.submit(() => {
assert(!Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "pool2-1")
""
})
ret.get()
} finally {
pool2.shutdown()
}

val pool3 = Executors.newFixedThreadPool(2,
new ThreadFactoryBuilder().setNameFormat("pool3-%d").setDaemon(false).build())
try {
pool3.submit(new Callable[String] {
override def call(): String = {
assert(!Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "pool3-0")
""
}
}).get()
} finally {
pool3.shutdown()
}
}
}
42 changes: 41 additions & 1 deletion jenkins/databricks/build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -87,11 +87,19 @@ then
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks9.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks9.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks9.jar
ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.12.jar
ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.12.jar
ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.12.jar
else
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks6.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks6.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks6.jar
ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.10.jar
ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.10.jar
ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.10.jar
fi

PROTOBUF_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--com.google.protobuf--protobuf-java--com.google.protobuf__protobuf-java__2.6.1.jar
PARQUETFORMATJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar

NETWORKCOMMON=----workspace_${SPARK_MAJOR_VERSION_STRING}--common--network-common--network-common-hive-2.3__hadoop-2.7_2.12_deploy.jar
Expand Down Expand Up @@ -363,6 +371,38 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_CORE_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-core \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_SHIM_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-shims \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_MAPREDUCE_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-mapreduce \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$PROTOBUF_JAR \
-DgroupId=com.google.protobuf \
-DartifactId=protobuf-java \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests

cd /home/ubuntu
Expand Down
Loading

0 comments on commit 215541a

Please sign in to comment.