Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the ORC version that corresponds to the Spark version [databricks] #4408

Merged
merged 26 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions NOTICE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,6 @@ Copyright 2014 and onwards The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

---------------------------------------------------------------------

Apache ORC
Copyright 2013-2019 The Apache Software Foundation

This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).

This product includes software developed by Hewlett-Packard:
(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P

---------------------------------------------------------------------
UCF Consortium - Unified Communication X (UCX)

Expand Down
51 changes: 3 additions & 48 deletions aggregator/pom.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2021, NVIDIA CORPORATION.
Copyright (c) 2021-2022, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -93,53 +93,8 @@
</transformers>
<relocations>
<relocation>
<pattern>org.apache.orc.</pattern>
<shadedPattern>${rapids.shade.package}.orc.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.hadoop.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
<excludes>
<!--
Class exclusions for Hive UDFs, to avoid the ClassNotFoundException,
For example:
E Caused by: java.lang.ClassNotFoundException: com.nvidia.shaded.spark.hadoop.hive.serde2.objectinspector.ObjectInspector
E at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
-->
<exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
<exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
<exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
<exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
<exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
</excludes>
</relocation>
<relocation>
<pattern>org.apache.hive.</pattern>
<shadedPattern>${rapids.shade.package}.hive.</shadedPattern>
</relocation>
<relocation>
<pattern>io.airlift.compress.</pattern>
<shadedPattern>${rapids.shade.package}.io.airlift.compress.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.codec.</pattern>
<shadedPattern>${rapids.shade.package}.org.apache.commons.codec.</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.commons.lang.</pattern>
<shadedPattern>${rapids.shade.package}.org.apache.commons.lang.</shadedPattern>
</relocation>
<relocation>
<pattern>com.google</pattern>
<shadedPattern>${rapids.shade.package}.com.google</shadedPattern>
<pattern>com.google.flatbuffers</pattern>
<shadedPattern>${rapids.shade.package}.com.google.flatbuffers</shadedPattern>
</relocation>
</relocations>
<filters>
Expand Down
96 changes: 96 additions & 0 deletions common/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2022, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Common utility code shared by the other RAPIDS Accelerator modules -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>22.02.0-SNAPSHOT</version>
jlowe marked this conversation as resolved.
Show resolved Hide resolved
</parent>

<artifactId>rapids-4-spark-common_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Common</name>
<description>Utility code that is common across the RAPIDS Accelerator projects</description>
<version>22.02.0-SNAPSHOT</version>
jlowe marked this conversation as resolved.
Show resolved Hide resolved

<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<resources>
<resource>
<!-- Include the properties file to provide the build information. -->
<directory>${project.build.directory}/extra-resources</directory>
<filtering>true</filtering>
</resource>
<resource>
<directory>${project.basedir}/..</directory>
<targetPath>META-INF</targetPath>
<includes>
<!-- The NOTICE will be taken care of by the antrun task below -->
<include>LICENSE</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>default-test-jar</id>
<phase>none</phase>
</execution>
</executions>
</plugin>
<!-- Disable surefire, as the tests are run elsewhere -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
23 changes: 23 additions & 0 deletions common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

/** Argument-validation helpers. */
object CheckUtils {
  /**
   * Verifies a precondition on a caller-supplied argument.
   *
   * @param expression the condition that must hold
   * @param msg        message carried by the exception when the condition is false
   * @throws IllegalArgumentException if `expression` is false
   */
  def checkArgument(expression: Boolean, msg: String): Unit = {
    if (expression) {
      // Precondition satisfied: nothing to do.
      ()
    } else {
      throw new IllegalArgumentException(msg)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import java.util.concurrent.{Executors, ThreadFactory}
import java.util.concurrent.atomic.AtomicLong

// This is similar to Guava's ThreadFactoryBuilder.
// Guava itself is avoided because it is a messy dependency in practice.
/**
 * Fluent builder for `java.util.concurrent.ThreadFactory` instances.
 *
 * Threads are created by `Executors.defaultThreadFactory` and then customized with
 * whichever settings were supplied to this builder. Settings that were never supplied
 * are left at the default factory's values.
 */
class ThreadFactoryBuilder {
  private var nameFormat = Option.empty[String]
  private var daemon = Option.empty[Boolean]

  /**
   * Sets the format string used to name the threads created by the built factory.
   *
   * The format receives a single argument: the zero-based index of the thread within
   * the factory (0 for the first thread, 1 for the second, ...).
   *
   * @param nameFormat a `String.format`-style pattern, e.g. `"worker-%d"`
   * @return this builder, for chaining
   * @throws java.util.IllegalFormatException if the pattern cannot format an integer
   */
  def setNameFormat(nameFormat: String): ThreadFactoryBuilder = {
    // Fail fast: formatting a sample value throws here, at configuration time, if the
    // pattern is malformed, rather than later when a thread is created.
    nameFormat.format(0)
    this.nameFormat = Some(nameFormat)
    this
  }

  /**
   * Sets whether the threads created by the built factory are daemon threads.
   *
   * @param daemon the daemon status to apply to each new thread
   * @return this builder, for chaining
   */
  def setDaemon(daemon: Boolean): ThreadFactoryBuilder = {
    this.daemon = Some(daemon)
    this
  }

  /**
   * Creates a thread factory using the settings supplied so far.
   *
   * The settings are captured when this method is called, so later calls to the setters
   * on this builder do not affect factories that were already built.
   *
   * @return a new thread factory with its own thread counter
   */
  def build(): ThreadFactory = {
    // Snapshot the mutable builder state. Reading the builder's vars lazily from inside
    // the factory would let later setter calls reconfigure an existing factory.
    val nameFormatSnapshot = nameFormat
    val daemonSnapshot = daemon
    // Each built factory numbers its threads independently, starting at 0.
    val threadIndex = new AtomicLong(0)
    val delegate = Executors.defaultThreadFactory

    new ThreadFactory() {
      override def newThread(r: Runnable): Thread = {
        val thread = delegate.newThread(r)
        nameFormatSnapshot.foreach { format =>
          thread.setName(format.format(threadIndex.getAndIncrement()))
        }
        daemonSnapshot.foreach(isDaemon => thread.setDaemon(isDaemon))
        thread
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids

import java.util.concurrent.{Callable, Executors, ExecutorService}

import org.scalatest.FunSuite

/**
 * Checks that factories produced by ThreadFactoryBuilder apply the configured name
 * format and daemon flag to the threads they create.
 */
class ThreadFactoryBuilderTest extends FunSuite {

test("test thread factory builder") {
// Case 1: a name format plus daemon = true.
var pool1: ExecutorService = null
jlowe marked this conversation as resolved.
Show resolved Hide resolved
try {
pool1 = Executors.newFixedThreadPool(2,
new ThreadFactoryBuilder().setNameFormat("thread-pool1-1 %s").setDaemon(true).build())
jlowe marked this conversation as resolved.
Show resolved Hide resolved
// The first submitted task runs on the first thread the factory creates (index 0).
var ret = pool1.submit(new Callable[String] {
override def call(): String = {
assert(Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "thread-pool1-1 0")
""
}
})
// Block until the task finishes. If an assert above failed inside the task,
// get() rethrows the failure wrapped in an ExecutionException.
ret.get()
// The pool is still below its size of 2, so this submit makes the pool create a
// second thread (index 1) rather than reuse the idle first one.
ret = pool1.submit(() => {
assert(Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "thread-pool1-1 1")
""
})
ret.get()
} finally {
if (pool1 != null) pool1.shutdown()
}

// Case 2: a name format only. setDaemon is never called, and the asserts below
// expect the resulting threads to be non-daemon.
var pool2: ExecutorService = null
try {
pool2 = Executors.newFixedThreadPool(2,
new ThreadFactoryBuilder().setNameFormat("pool2-%d").build())

var ret = pool2.submit(new Callable[String] {
override def call(): String = {
assert(!Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "pool2-0")
""
}
})
ret.get()
ret = pool2.submit(() => {
assert(!Thread.currentThread().isDaemon)
assert(Thread.currentThread().getName == "pool2-1")
""
})
ret.get()
} finally {
if (pool2 != null) pool2.shutdown()
}
}
}
42 changes: 41 additions & 1 deletion jenkins/databricks/build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -87,10 +87,18 @@ then
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks9.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks9.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks9.jar
ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.12.jar
ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.12.jar
ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.12.jar
PROTOBUF_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--com.google.protobuf--protobuf-java--com.google.protobuf__protobuf-java__2.6.1.jar
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This jar looks the same as the one in the else branch; if so, move it out of the if/else.

else
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks6.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks6.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks6.jar
ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.10.jar
ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.10.jar
ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.10.jar
PROTOBUF_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--com.google.protobuf--protobuf-java--com.google.protobuf__protobuf-java__2.6.1.jar
fi
PARQUETFORMATJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar

Expand Down Expand Up @@ -363,6 +371,38 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_CORE_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-core \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_SHIM_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-shims \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$ORC_MAPREDUCE_JAR \
-DgroupId=org.apache.orc \
-DartifactId=orc-mapreduce \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B install:install-file \
-Dmaven.repo.local=$M2DIR \
-Dfile=$JARDIR/$PROTOBUF_JAR \
-DgroupId=com.google.protobuf \
-DartifactId=protobuf-java \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar

mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests

cd /home/ubuntu
Expand Down
Loading