Skip to content

Commit

Permalink
Merge branch 'main' into QT
Browse files Browse the repository at this point in the history
  • Loading branch information
srinivasst authored Jan 9, 2025
2 parents 957b43a + a326e54 commit 7e531f1
Show file tree
Hide file tree
Showing 16 changed files with 547 additions and 417 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/velox_backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ jobs:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
celeborn: [ "celeborn-0.5.2", "celeborn-0.4.3", "celeborn-0.3.2-incubating" ]
celeborn: [ "celeborn-0.5.3", "celeborn-0.4.3", "celeborn-0.3.2-incubating" ]
runs-on: ubuntu-20.04
container: apache/gluten:centos-8
steps:
Expand All @@ -568,7 +568,7 @@ jobs:
EXTRA_PROFILE=""
if [ "${{ matrix.celeborn }}" = "celeborn-0.4.3" ]; then
EXTRA_PROFILE="-Pceleborn-0.4"
elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.2" ]; then
elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.3" ]; then
EXTRA_PROFILE="-Pceleborn-0.5"
fi
echo "EXTRA_PROFILE: ${EXTRA_PROFILE}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.gluten.columnarbatch;

import org.apache.gluten.backendsapi.BackendsApiManager;
import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators;
import org.apache.gluten.runtime.Runtime;
import org.apache.gluten.runtime.Runtimes;

Expand Down Expand Up @@ -56,6 +57,7 @@ public static void checkNonVeloxBatch(ColumnarBatch batch) {
}

public static ColumnarBatch toVeloxBatch(ColumnarBatch input) {
ColumnarBatches.checkOffloaded(input);
if (ColumnarBatches.isZeroColumnBatch(input)) {
return input;
}
Expand Down Expand Up @@ -86,6 +88,26 @@ public static ColumnarBatch toVeloxBatch(ColumnarBatch input) {
return input;
}

/**
 * Return a batch guaranteed to be in Velox format, converting only when necessary.
 *
 * <p>The input is first offloaded via {@code ColumnarBatches.ensureOffloaded} (a no-op for an
 * already-light batch), and the result is converted to Velox format unless it already is one.
 *
 * <p>Should only be used for certain conditions when unable to insert explicit to-Velox
 * transitions through query planner.
 *
 * <p>For example, used by {@link org.apache.spark.sql.execution.ColumnarCachedBatchSerializer} as
 * Spark directly calls API ColumnarCachedBatchSerializer#convertColumnarBatchToCachedBatch for
 * query plan that returns supportsColumnar=true without generating a cache-write query plan node.
 */
public static ColumnarBatch ensureVeloxBatch(ColumnarBatch input) {
  final ColumnarBatch offloaded =
      ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), input);
  return isVeloxBatch(offloaded) ? offloaded : toVeloxBatch(offloaded);
}

/**
* Combine multiple columnar batches horizontally, assuming each of them is already offloaded.
* Otherwise {@link UnsupportedOperationException} will be thrown.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,24 +171,16 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with Logging {
conf: SQLConf): RDD[CachedBatch] = {
input.mapPartitions {
it =>
val lightBatches = it.map {
val veloxBatches = it.map {
/* Native code needs a Velox offloaded batch, making sure to offload
if heavy batch is encountered */
batch =>
val heavy = ColumnarBatches.isHeavyBatch(batch)
if (heavy) {
val offloaded = VeloxColumnarBatches.toVeloxBatch(
ColumnarBatches.offload(ArrowBufferAllocators.contextInstance(), batch))
offloaded
} else {
batch
}
batch => VeloxColumnarBatches.ensureVeloxBatch(batch)
}
new Iterator[CachedBatch] {
override def hasNext: Boolean = lightBatches.hasNext
override def hasNext: Boolean = veloxBatches.hasNext

override def next(): CachedBatch = {
val batch = lightBatches.next()
val batch = veloxBatches.next()
val results =
ColumnarBatchSerializerJniWrapper
.create(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,33 +67,26 @@ class ArrowCsvScanSuiteV2 extends ArrowCsvScanSuite {
}
}

/** Since https://github.com/apache/incubator-gluten/pull/5850. */
abstract class ArrowCsvScanSuite extends VeloxWholeStageTransformerSuite {
override protected val resourcePath: String = "N/A"
override protected val fileFormat: String = "N/A"

protected val rootPath: String = getClass.getResource("/").getPath

override def beforeAll(): Unit = {
super.beforeAll()
createCsvTables()
}

override def afterAll(): Unit = {
super.afterAll()
}

/**
 * Arrow CSV scan suite variant that runs with Gluten's columnar table cache enabled.
 * Inherits table setup (beforeAll/createCsvTables) from [[ArrowCsvScanSuiteBase]].
 */
class ArrowCsvScanWithTableCacheSuite extends ArrowCsvScanSuiteBase {
  override protected def sparkConf: SparkConf = {
    // Reuse the base configuration, then force the V1 CSV source and enable the
    // columnar table cache — the code path under test in this suite.
    super.sparkConf
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
      .set("spark.sql.files.maxPartitionBytes", "1g")
      .set("spark.sql.shuffle.partitions", "1")
      .set("spark.memory.offHeap.size", "2g")
      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
      .set(GlutenConfig.NATIVE_ARROW_READER_ENABLED.key, "true")
      .set("spark.sql.sources.useV1SourceList", "csv")
      .set(GlutenConfig.COLUMNAR_TABLE_CACHE_ENABLED.key, "true")
  }

  /**
   * Test for GLUTEN-8453: https://github.com/apache/incubator-gluten/issues/8453. To make sure no
   * error is thrown when caching an Arrow Java query plan.
   */
  test("csv scan v1 with table cache") {
    // Caching then collecting exercises the cached-batch serialization path.
    // 3 rows expected — presumably the fixture CSV has three records; the fixture
    // is created outside this view (createCsvTables) — TODO confirm.
    val df = spark.sql("select * from student")
    df.cache()
    assert(df.collect().length == 3)
  }
}

/** Since https://github.com/apache/incubator-gluten/pull/5850. */
abstract class ArrowCsvScanSuite extends ArrowCsvScanSuiteBase {
test("csv scan with option string as null") {
val df = runAndCompare("select * from student_option_str")()
val plan = df.queryExecution.executedPlan
Expand Down Expand Up @@ -152,6 +145,33 @@ abstract class ArrowCsvScanSuite extends VeloxWholeStageTransformerSuite {
val df = runAndCompare("select count(1) from student")()
checkLengthAndPlan(df, 1)
}
}

abstract class ArrowCsvScanSuiteBase extends VeloxWholeStageTransformerSuite {
override protected val resourcePath: String = "N/A"
override protected val fileFormat: String = "N/A"

protected val rootPath: String = getClass.getResource("/").getPath

  override def beforeAll(): Unit = {
    // Create the CSV-backed test tables once before any test in the suite runs.
    super.beforeAll()
    createCsvTables()
  }

  override def afterAll(): Unit = {
    // No suite-specific teardown; delegate to the base cleanup only.
    super.afterAll()
  }

  override protected def sparkConf: SparkConf = {
    // Common Spark configuration shared by all Arrow CSV scan suites: columnar
    // shuffle manager, off-heap memory with leak detection, broadcast joins
    // disabled, and Gluten's native Arrow reader enabled.
    super.sparkConf
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
      .set("spark.sql.files.maxPartitionBytes", "1g")
      .set("spark.sql.shuffle.partitions", "1")
      .set("spark.memory.offHeap.size", "2g")
      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
      .set("spark.sql.autoBroadcastJoinThreshold", "-1")
      .set(GlutenConfig.NATIVE_ARROW_READER_ENABLED.key, "true")
  }

private def createCsvTables(): Unit = {
spark.read
Expand Down
2 changes: 1 addition & 1 deletion dev/docker/Dockerfile.centos8-dynamic-build
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ENV PATH=${PATH}:/usr/lib/maven/bin

RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.3.2-incubating/apache-celeborn-0.3.2-incubating-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.3/apache-celeborn-0.4.3-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.2/apache-celeborn-0.5.2-bin.tgz -P /opt/
RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.3/apache-celeborn-0.5.3-bin.tgz -P /opt/

RUN git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten

Expand Down
67 changes: 57 additions & 10 deletions docs/developers/velox-backend-build-in-docker.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,64 @@ nav_order: 7
parent: Developer Overview
---

Currently, Centos-7/8/9 and Ubuntu 20.04/22.04 are supported to build Gluten Velox backend. Please refer to
`.github/workflows/velox_weekly.yml` to install required tools before the build.
Currently, there are two ways to build Gluten: static link or dynamic link.

There are two docker images with almost all dependencies installed, respective for static build and dynamic build.
The according Dockerfiles are respectively `Dockerfile.centos7-static-build` and `Dockerfile.centos8-dynamic-build`
under `dev/docker/`.
# Static link
The static link approach builds all dependency libraries in vcpkg for both Velox and Gluten. It then statically links these libraries into libvelox.so and libgluten.so, enabling Gluten to be built on *any* x86-64 Linux OS with 64G of memory. However, we have only verified the build on CentOS 7/8/9 and Ubuntu 20.04/22.04. Please submit an issue if it fails on your OS.

```shell
# For static build on centos-7.
docker pull apache/gluten:vcpkg-centos-7
Here are the dependency libraries required on the target system; they are essential libraries pre-installed in every Linux OS.
```
linux-vdso.so.1
librt.so.1
libpthread.so.0
libdl.so.2
libm.so.6
libc.so.6
/lib64/ld-linux-x86-64.so.2
```

The 'dockerfile' used to build the Gluten jar:

```
FROM apache/gluten:vcpkg-centos-7
# For dynamic build on centos-8.
docker pull apache/gluten:centos-8 (dynamic build)
# Build Gluten Jar
RUN source /opt/rh/devtoolset-11/enable && \
git clone https://github.com/apache/incubator-gluten.git && \
cd incubator-gluten && \
./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_s3=ON --enable_gcs=ON --enable_abfs=ON --enable_vcpkg=ON --build_arrow=OFF && \
mvn clean package -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-3.4 -DskipTests
```
`enable_vcpkg=ON` enables the static link. Vcpkg packages are already pre-installed in the vcpkg-centos-7 image and can be reused automatically. The image is maintained by Gluten community.

The following command builds the Gluten jar inside the 'glutenimage' image (note the trailing `.` build-context argument, which `docker build` requires):
```
docker build -t glutenimage -f dockerfile .
```
The Gluten jar can then be copied from glutenimage:/incubator-gluten/package/target/gluten-velox-bundle-*.jar

# Dynamic link
The dynamic link approach requires the dependency libraries to be installed on the system. It then dynamically links these .so files into libvelox.so and libgluten.so. Currently, CentOS 7/8/9 and
Ubuntu 20.04/22.04 are supported for building the Gluten Velox backend dynamically.

The 'dockerfile' used to build the Gluten jar:

```
FROM apache/gluten:centos-8
# Build Gluten Jar
RUN source /opt/rh/devtoolset-11/enable && \
git clone https://github.com/apache/incubator-gluten.git && \
cd incubator-gluten && \
./dev/builddeps-veloxbe.sh --run_setup_script=ON --enable_hdfs=ON --enable_vcpkg=OFF --build_arrow=OFF && \
mvn clean package -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-3.4 -DskipTests && \
./dev/build-thirdparty.sh
```
`enable_vcpkg=OFF` enables the dynamic link. Some of the shared libraries are pre-installed in the image. You need to specify `--run_setup_script=ON` to install the rest of them. All dependency libraries are then packaged into a jar by `build-thirdparty.sh`.
Please note that the image is based on CentOS 8, so building and deploying the jar on other OSes carries some risk.

The following command builds the Gluten jar inside the 'glutenimage' image (note the trailing `.` build-context argument, which `docker build` requires):
```
docker build -t glutenimage -f dockerfile .
```
The Gluten jar and the third-party library jar can then be copied from glutenimage:/incubator-gluten/package/target/gluten-velox-bundle-*.jar and glutenimage:/incubator-gluten/package/target/gluten-thirdparty-lib-*.jar respectively.
Loading

0 comments on commit 7e531f1

Please sign in to comment.