Skip to content

Commit

Permalink
Merge branch 'main' into QT
Browse files Browse the repository at this point in the history
  • Loading branch information
srinivasst authored Jan 9, 2025
2 parents 957b43a + a326e54 commit 7e531f1
Show file tree
Hide file tree
Showing 16 changed files with 547 additions and 417 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/velox_backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ jobs:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
celeborn: [ "celeborn-0.5.2", "celeborn-0.4.3", "celeborn-0.3.2-incubating" ]
celeborn: [ "celeborn-0.5.3", "celeborn-0.4.3", "celeborn-0.3.2-incubating" ]
runs-on: ubuntu-20.04
container: apache/gluten:centos-8
steps:
Expand All @@ -568,7 +568,7 @@ jobs:
EXTRA_PROFILE=""
if [ "${{ matrix.celeborn }}" = "celeborn-0.4.3" ]; then
EXTRA_PROFILE="-Pceleborn-0.4"
elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.2" ]; then
elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.3" ]; then
EXTRA_PROFILE="-Pceleborn-0.5"
fi
echo "EXTRA_PROFILE: ${EXTRA_PROFILE}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.gluten.columnarbatch;

import org.apache.gluten.backendsapi.BackendsApiManager;
import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators;
import org.apache.gluten.runtime.Runtime;
import org.apache.gluten.runtime.Runtimes;

Expand Down Expand Up @@ -56,6 +57,7 @@ public static void checkNonVeloxBatch(ColumnarBatch batch) {
}

public static ColumnarBatch toVeloxBatch(ColumnarBatch input) {
ColumnarBatches.checkOffloaded(input);
if (ColumnarBatches.isZeroColumnBatch(input)) {
return input;
}
Expand Down Expand Up @@ -86,6 +88,26 @@ public static ColumnarBatch toVeloxBatch(ColumnarBatch input) {
return input;
}

/**
 * Return a batch guaranteed to be in Velox format, converting only when necessary.
 *
 * <p>The input is first offloaded via {@code ColumnarBatches.ensureOffloaded} (a no-op for an
 * already-light batch), and the result is converted to Velox format unless it already is one.
 *
 * <p>Should only be used for certain conditions when unable to insert explicit to-Velox
 * transitions through query planner.
 *
 * <p>For example, used by {@link org.apache.spark.sql.execution.ColumnarCachedBatchSerializer} as
 * Spark directly calls API ColumnarCachedBatchSerializer#convertColumnarBatchToCachedBatch for
 * query plan that returns supportsColumnar=true without generating a cache-write query plan node.
 */
public static ColumnarBatch ensureVeloxBatch(ColumnarBatch input) {
  final ColumnarBatch offloaded =
      ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), input);
  return isVeloxBatch(offloaded) ? offloaded : toVeloxBatch(offloaded);
}

/**
* Combine multiple columnar batches horizontally, assuming each of them is already offloaded.
* Otherwise {@link UnsupportedOperationException} will be thrown.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,24 +171,16 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with Logging {
conf: SQLConf): RDD[CachedBatch] = {
input.mapPartitions {
it =>
val lightBatches = it.map {
val veloxBatches = it.map {
/* Native code needs a Velox offloaded batch, making sure to offload
if heavy batch is encountered */
batch =>
val heavy = ColumnarBatches.isHeavyBatch(batch)
if (heavy) {
val offloaded = VeloxColumnarBatches.toVeloxBatch(
ColumnarBatches.offload(ArrowBufferAllocators.contextInstance(), batch))
offloaded
} else {
batch
}
batch => VeloxColumnarBatches.ensureVeloxBatch(batch)
}
new Iterator[CachedBatch] {
override def hasNext: Boolean = lightBatches.hasNext
override def hasNext: Boolean = veloxBatches.hasNext

override def next(): CachedBatch = {
val batch = lightBatches.next()
val batch = veloxBatches.next()
val results =
ColumnarBatchSerializerJniWrapper
.create(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,33 +67,26 @@ class ArrowCsvScanSuiteV2 extends ArrowCsvScanSuite {
}
}

/** Since https://github.com/apache/incubator-gluten/pull/5850. */
abstract class ArrowCsvScanSuite extends VeloxWholeStageTransformerSuite {
override protected val resourcePath: String = "N/A"
override protected val fileFormat: String = "N/A"

protected val rootPath: String = getClass.getResource("/").getPath

override def beforeAll(): Unit = {
super.beforeAll()
createCsvTables()
}

override def afterAll(): Unit = {
super.afterAll()
}

/**
 * Arrow CSV scan suite variant that runs with Gluten's columnar table cache enabled.
 * Inherits table setup (beforeAll/createCsvTables) from [[ArrowCsvScanSuiteBase]].
 */
class ArrowCsvScanWithTableCacheSuite extends ArrowCsvScanSuiteBase {
  override protected def sparkConf: SparkConf = {
    // Reuse the base configuration, then force the V1 CSV source and enable the
    // columnar table cache — the code path under test in this suite.
    super.sparkConf
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
      .set("spark.sql.files.maxPartitionBytes", "1g")
      .set("spark.sql.shuffle.partitions", "1")
      .set("spark.memory.offHeap.size", "2g")
      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
      .set(GlutenConfig.NATIVE_ARROW_READER_ENABLED.key, "true")
      .set("spark.sql.sources.useV1SourceList", "csv")
      .set(GlutenConfig.COLUMNAR_TABLE_CACHE_ENABLED.key, "true")
  }

  /**
   * Test for GLUTEN-8453: https://github.com/apache/incubator-gluten/issues/8453. To make sure no
   * error is thrown when caching an Arrow Java query plan.
   */
  test("csv scan v1 with table cache") {
    // Caching then collecting exercises the cached-batch serialization path.
    // 3 rows expected — presumably the fixture CSV has three records; the fixture
    // is created outside this view (createCsvTables) — TODO confirm.
    val df = spark.sql("select * from student")
    df.cache()
    assert(df.collect().length == 3)
  }
}

/** Since https://github.com/apache/incubator-gluten/pull/5850. */
abstract class ArrowCsvScanSuite extends ArrowCsvScanSuiteBase {
test("csv scan with option string as null") {
val df = runAndCompare("select * from student_option_str")()
val plan = df.queryExecution.executedPlan
Expand Down Expand Up @@ -152,6 +145,33 @@ abstract class ArrowCsvScanSuite extends VeloxWholeStageTransformerSuite {
val df = runAndCompare("select count(1) from student")()
checkLengthAndPlan(df, 1)
}
}

abstract class ArrowCsvScanSuiteBase extends VeloxWholeStageTransformerSuite {
override protected val resourcePath: String = "N/A"
override protected val fileFormat: String = "N/A"

protected val rootPath: String = getClass.getResource("/").getPath

  override def beforeAll(): Unit = {
    // Create the CSV-backed test tables once before any test in the suite runs.
    super.beforeAll()
    createCsvTables()
  }

  override def afterAll(): Unit = {
    // No suite-specific teardown; delegate to the base cleanup only.
    super.afterAll()
  }

  override protected def sparkConf: SparkConf = {
    // Common Spark configuration shared by all Arrow CSV scan suites: columnar
    // shuffle manager, off-heap memory with leak detection, broadcast joins
    // disabled, and Gluten's native Arrow reader enabled.
    super.sparkConf
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
      .set("spark.sql.files.maxPartitionBytes", "1g")
      .set("spark.sql.shuffle.partitions", "1")
      .set("spark.memory.offHeap.size", "2g")
      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
      .set(GlutenConfig.NATIVE_ARROW_READER_ENABLED.key, "true")
  }

private def createCsvTables(): Unit = {
spark.read
Expand Down
2 changes: 1 addition & 1 deletion dev/docker/Dockerfile.centos8-dynamic-build
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ENV PATH=${PATH}:/usr/lib/maven/bin

RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.3.2-incubating/apache-celeborn-0.3.2-incubating-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.3/apache-celeborn-0.4.3-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.2/apache-celeborn-0.5.2-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.3/apache-celeborn-0.5.3-bin.tgz -P /opt/

RUN git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten

Expand Down
67 changes: 57 additions & 10 deletions docs/developers/velox-backend-build-in-docker.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,64 @@ nav_order: 7
parent: Developer Overview
---

Currently, Centos-7/8/9 and Ubuntu 20.04/22.04 are supported to build Gluten Velox backend. Please refer to
`.github/workflows/velox_weekly.yml` to install required tools before the build.
Currently, there are two ways to build Gluten: static link or dynamic link.

There are two docker images with almost all dependencies installed, respective for static build and dynamic build.
The according Dockerfiles are respectively `Dockerfile.centos7-static-build` and `Dockerfile.centos8-dynamic-build`
under `dev/docker/`.
# Static link
The static link approach builds all dependency libraries in vcpkg for both Velox and Gluten. It then statically links these libraries into libvelox.so and libgluten.so, enabling Gluten to be built on *any* x86-64 Linux OS with 64G of memory. However, we have only verified the build on CentOS 7/8/9 and Ubuntu 20.04/22.04. Please submit an issue if it fails on your OS.

```shell
# For static build on centos-7.
docker pull apache/gluten:vcpkg-centos-7
Here are the dependency libraries required on the target system; they are essential libraries pre-installed in every Linux OS.
```
linux-vdso.so.1
librt.so.1
libpthread.so.0
libdl.so.2
libm.so.6
libc.so.6
/lib64/ld-linux-x86-64.so.2
```

The 'dockerfile' used to build the Gluten jar:

```
FROM apache/gluten:vcpkg-centos-7
# For dynamic build on centos-8.
docker pull apache/gluten:centos-8 (dynamic build)
# Build Gluten Jar
RUN source /opt/rh/devtoolset-11/enable && \
git clone https://github.com/apache/incubator-gluten.git && \
cd incubator-gluten && \
./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_s3=ON --enable_gcs=ON --enable_abfs=ON --enable_vcpkg=ON --build_arrow=OFF && \
mvn clean package -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-3.4 -DskipTests
```
`enable_vcpkg=ON` enables the static link. Vcpkg packages are already pre-installed in the vcpkg-centos-7 image and can be reused automatically. The image is maintained by Gluten community.

The following command builds the Gluten jar inside the 'glutenimage' image (note the trailing `.` build-context argument, which `docker build` requires):
```
docker build -t glutenimage -f dockerfile .
```
The Gluten jar can then be copied from glutenimage:/incubator-gluten/package/target/gluten-velox-bundle-*.jar

# Dynamic link
The dynamic link approach requires the dependency libraries to be installed on the system. It then dynamically links these .so files into libvelox.so and libgluten.so. Currently, CentOS 7/8/9 and
Ubuntu 20.04/22.04 are supported for building the Gluten Velox backend dynamically.

The 'dockerfile' used to build the Gluten jar:

```
FROM apache/gluten:centos-8
# Build Gluten Jar
RUN source /opt/rh/devtoolset-11/enable && \
git clone https://github.com/apache/incubator-gluten.git && \
cd incubator-gluten && \
./dev/builddeps-veloxbe.sh --run_setup_script=ON --enable_hdfs=ON --enable_vcpkg=OFF --build_arrow=OFF && \
mvn clean package -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-3.4 -DskipTests && \
./dev/build-thirdparty.sh
```
`enable_vcpkg=OFF` enables the dynamic link. Some of the shared libraries are pre-installed in the image. You need to specify `--run_setup_script=ON` to install the rest of them. All dependency libraries are then packaged into a jar by `build-thirdparty.sh`.
Please note that the image is based on CentOS 8, so building and deploying the jar on other OSes carries some risk.

The following command builds the Gluten jar inside the 'glutenimage' image (note the trailing `.` build-context argument, which `docker build` requires):
```
docker build -t glutenimage -f dockerfile .
```
The Gluten jar and the third-party library jar can then be copied from glutenimage:/incubator-gluten/package/target/gluten-velox-bundle-*.jar and glutenimage:/incubator-gluten/package/target/gluten-thirdparty-lib-*.jar respectively.
Loading

0 comments on commit 7e531f1

Please sign in to comment.