Merge branch 'branch-24.06' into regex_string_digits_pattern

thirtiseven · Apr 26, 2024 · d290ccd · d290ccd
2 parents 42db581 + 82f838a
commit d290ccd
Show file tree

Hide file tree

Showing 285 changed files with 3,219 additions and 712 deletions.
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
@@ -35,21 +35,17 @@ jobs:
     # This job only runs for pull request comments
     if: contains( '\
       abellina,\
-      andygrove,\
       anfeng,\
       firestarman,\
       GaryShen2008,\
-      jbrennan333, \
       jlowe,\
-      krajendrannv,\
       kuhushukla,\
       mythrocks,\
       nartal1,\
       nvdbaranec,\
       NvTimLiu,\
       razajafri,\
       revans2,\
-      rongou,\
       rwlee,\
       sameerz,\
       tgravescs,\
@@ -73,6 +69,7 @@ jobs:
       yinqingh,\
       parthosa,\
       liurenjie1024,\
+      binmahone,\
       ', format('{0},', github.actor)) && github.event.comment.body == 'build'
     steps:
       - name: Check if comment is issued by authorized person

diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
@@ -728,6 +728,23 @@
                 </dependency>
             </dependencies>
         </profile>
+        <profile>
+            <id>release343</id>
+            <activation>
+                <property>
+                    <name>buildver</name>
+                    <value>343</value>
+                </property>
+            </activation>
+            <dependencies>
+                <dependency>
+                    <groupId>com.nvidia</groupId>
+                    <artifactId>rapids-4-spark-delta-24x_${scala.binary.version}</artifactId>
+                    <version>${project.version}</version>
+                    <classifier>${spark.version.classifier}</classifier>
+                </dependency>
+            </dependencies>
+        </profile>
         <profile>
             <id>release350</id>
             <activation>

diff --git a/.../src/main/spark320/scala/org/apache/spark/sql/tests/datagen/datagen/DataGenExprBase.scala b/.../src/main/spark320/scala/org/apache/spark/sql/tests/datagen/datagen/DataGenExprBase.scala
@@ -34,8 +34,10 @@
 {"spark": "341"}
 {"spark": "341db"}
 {"spark": "342"}
+{"spark": "343"}
 {"spark": "350"}
 {"spark": "351"}
+{"spark": "400"}
 spark-rapids-shim-json-lines ***/
 package org.apache.spark.sql.tests.datagen
 

diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md
@@ -129,12 +129,12 @@ Name | Description | Default Value | Applicable at
 <a name="sql.json.read.decimal.enabled"></a>spark.rapids.sql.json.read.decimal.enabled|When reading a quoted string as a decimal Spark supports reading non-ascii unicode digits, and the RAPIDS Accelerator does not.|true|Runtime
 <a name="sql.json.read.double.enabled"></a>spark.rapids.sql.json.read.double.enabled|JSON reading is not 100% compatible when reading doubles.|true|Runtime
 <a name="sql.json.read.float.enabled"></a>spark.rapids.sql.json.read.float.enabled|JSON reading is not 100% compatible when reading floats.|true|Runtime
-<a name="sql.json.read.mixedTypesAsString.enabled"></a>spark.rapids.sql.json.read.mixedTypesAsString.enabled|JSON reading is not 100% compatible when reading mixed types as string.|false|Runtime
 <a name="sql.mode"></a>spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime; you must restart the application for it to take effect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin converts the Spark operations and executes them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL; this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
 <a name="sql.optimizer.joinReorder.enabled"></a>spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
 <a name="sql.python.gpu.enabled"></a>spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
-<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation.|true|Runtime
-<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently this only works for parquet.|true|Runtime
+<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation. Currently this only supports ORC and Parquet formats.|true|Runtime
+<a name="sql.reader.chunked.limitMemoryUsage"></a>spark.rapids.sql.reader.chunked.limitMemoryUsage|Enable a soft limit on the internal memory usage of the chunked reader (if being used). This limit is calculated as the product of 'spark.rapids.sql.batchSizeBytes' and 'spark.rapids.sql.reader.chunked.memoryUsageRatio'. For example, if batchSizeBytes is set to 1GB and memoryUsageRatio is 4, the chunked reader will try to keep its memory usage under 4GB.|None|Runtime
+<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently deprecated and replaced by 'spark.rapids.sql.reader.chunked.limitMemoryUsage'.|None|Runtime
 <a name="sql.reader.multithreaded.combine.sizeBytes"></a>spark.rapids.sql.reader.multithreaded.combine.sizeBytes|The target size in bytes to combine multiple small files together when using the MULTITHREADED parquet or orc reader. With combine disabled, the MULTITHREADED reader reads the files in parallel and sends individual files down to the GPU, but that can be inefficient for small files. When combine is enabled, files that are ready within spark.rapids.sql.reader.multithreaded.combine.waitTime together, up to this threshold size, are combined before sending down to GPU. This can be disabled by setting it to 0. Note that combine also will not go over the spark.rapids.sql.reader.batchSizeRows or spark.rapids.sql.reader.batchSizeBytes limits.|67108864|Runtime
 <a name="sql.reader.multithreaded.combine.waitTime"></a>spark.rapids.sql.reader.multithreaded.combine.waitTime|When using the multithreaded parquet or orc reader with combine mode, how long to wait, in milliseconds, for more files to finish if the size threshold has not been met. Note that this will wait this amount of time from when the last file was available, so total wait time could be larger than this.|200|Runtime
 <a name="sql.reader.multithreaded.read.keepOrder"></a>spark.rapids.sql.reader.multithreaded.read.keepOrder|When using the MULTITHREADED reader, if this is set to true we read the files in the same order Spark does, otherwise the order may not be the same. Now it is supported only for parquet and orc.|true|Runtime
@@ -269,7 +269,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
 <a name="sql.expression.IsNotNull"></a>spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None|
 <a name="sql.expression.IsNull"></a>spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None|
 <a name="sql.expression.JsonToStructs"></a>spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case|
-<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because JsonTuple on the GPU does not support all of the normalization that the CPU supports.|
+<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because it is an experimental feature that could be unstable or have performance issues.|
 <a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None|
 <a name="sql.expression.KnownNotNull"></a>spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None|
 <a name="sql.expression.Lag"></a>spark.rapids.sql.expression.Lag|`lag`|Window function that returns N entries behind this one|true|None|

diff --git a/docs/additional-functionality/shuffle-docker-examples/Dockerfile.rocky_no_rdma b/docs/additional-functionality/shuffle-docker-examples/Dockerfile.rocky_no_rdma
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
 #   - ROCKY_VER: Rocky Linux OS version
 
 ARG CUDA_VER=11.8.0
-ARG UCX_VER=1.15.0
+ARG UCX_VER=1.16.0
 ARG UCX_CUDA_VER=11
 ARG UCX_ARCH=x86_64
 ARG ROCKY_VER=8
@@ -38,6 +38,5 @@ RUN ls /usr/lib
 RUN mkdir /tmp/ucx_install && cd /tmp/ucx_install && \
   wget https://github.com/openucx/ucx/releases/download/v$UCX_VER/ucx-$UCX_VER-centos8-mofed5-cuda$UCX_CUDA_VER-$UCX_ARCH.tar.bz2 && \
   tar -xvf *.bz2 && \
-  rpm -i ucx-$UCX_VER*.rpm && \
-  rpm -i ucx-cuda-$UCX_VER*.rpm --nodeps && \
+  rpm -i `ls ucx-[0-9]*.rpm ucx-cuda-[0-9]*.rpm` --nodeps && \
   rm -rf /tmp/ucx_install
diff --git a/docs/additional-functionality/shuffle-docker-examples/Dockerfile.rocky_rdma b/docs/additional-functionality/shuffle-docker-examples/Dockerfile.rocky_rdma
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
 #   - ROCKY_VER: Rocky Linux OS version
 
 ARG CUDA_VER=11.8.0
-ARG UCX_VER=1.15.0
+ARG UCX_VER=1.16.0
 ARG UCX_CUDA_VER=11
 ARG UCX_ARCH=x86_64
 ARG ROCKY_VER=8
@@ -37,7 +37,5 @@ RUN yum update -y && yum install -y wget bzip2 rdma-core numactl-libs libgomp li
 RUN mkdir /tmp/ucx_install && cd /tmp/ucx_install && \
   wget https://github.com/openucx/ucx/releases/download/v$UCX_VER/ucx-$UCX_VER-centos8-mofed5-cuda$UCX_CUDA_VER-$UCX_ARCH.tar.bz2 && \
   tar -xvf *.bz2 && \
-  rpm -i ucx-$UCX_VER*.rpm && \
-  rpm -i ucx-cuda-$UCX_VER*.rpm --nodeps && \
-  rpm -i ucx-ib-$UCX_VER-1.el8.x86_64.rpm ucx-rdmacm-$UCX_VER-1.el8.x86_64.rpm && \
+  rpm -i `ls ucx-[0-9]*.rpm ucx-cuda-[0-9]*.rpm ucx-ib-[0-9]*.rpm ucx-rdmacm-[0-9]*.rpm` --nodeps && \
   rm -rf /tmp/ucx_install
diff --git a/docs/additional-functionality/shuffle-docker-examples/Dockerfile.ubuntu_no_rdma b/docs/additional-functionality/shuffle-docker-examples/Dockerfile.ubuntu_no_rdma
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 #
 
 ARG CUDA_VER=11.8.0
-ARG UCX_VER=1.15.0
+ARG UCX_VER=1.16.0
 ARG UCX_CUDA_VER=11
 ARG UCX_ARCH=x86_64
 ARG UBUNTU_VER=20.04

diff --git a/docs/additional-functionality/shuffle-docker-examples/Dockerfile.ubuntu_rdma b/docs/additional-functionality/shuffle-docker-examples/Dockerfile.ubuntu_rdma
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@
 
 ARG RDMA_CORE_VERSION=32.1
 ARG CUDA_VER=11.8.0
-ARG UCX_VER=1.15.0
+ARG UCX_VER=1.16.0
 ARG UCX_CUDA_VER=11
 ARG UCX_ARCH=x86_64
 ARG UBUNTU_VER=20.04