diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index e115736460b..43d1b6160c7 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -67,6 +67,7 @@ jobs: mattahrens,\ sinkinben,\ thirtiseven,\ + YanxuanLiu,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 44038eee414..ee981311582 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -68,7 +68,7 @@ jobs: - name: package aggregator check run: > - mvn -B package -pl aggregator -am + mvn -Dmaven.wagon.http.retryHandler.count=3 -B package -pl aggregator -am -P 'individual,pre-merge' -Dbuildver=${{ matrix.spark-version }} -DskipTests @@ -92,7 +92,7 @@ jobs: # includes RAT, code style and doc-gen checks of default shim - name: verify all modules with lowest-supported Spark version run: > - mvn -B verify + mvn -Dmaven.wagon.http.retryHandler.count=3 -B verify -P 'individual,pre-merge' -Dbuildver=${{ needs.get-noSnapshot-versions-from-dist.outputs.sparkHeadVersion }} -DskipTests diff --git a/dist/pom.xml b/dist/pom.xml index f2b3d3575ea..facf52d388a 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -58,6 +58,7 @@ 312db, 321db + ${project.build.directory}/${project.build.finalName}-${cuda.version}.jar @@ -331,7 +332,8 @@ + compress="${dist.jar.compress}" + destfile="${dist.jar.name}"/> @@ -450,6 +452,65 @@ + + org.apache.maven.plugins + maven-install-plugin + 3.0.1 + + + default-install + none + + + install-parallel-worlds-jar + install + + install-file + + + ${dist.jar.name} + ${project.artifactId} + ${cuda.version} + ${project.groupId} + ${project.version} + jar + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0 + + + default-deploy + none + + + deploy-parallel-worlds-jar + deploy + + deploy-file + + + ${dist.jar.name} + file://${java.io.tmpdir}/m2-repo + ${project.artifactId} + ${cuda.version} + ${project.groupId} + jar + + ${project.version} + + + + diff --git a/dist/unshimmed-spark311.txt b/dist/unshimmed-spark311.txt deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/docs/compatibility.md b/docs/compatibility.md index c63bdd47e02..10cfcfd0783 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -882,6 +882,8 @@ Casting from string to timestamp currently has the following limitations. | `"yyyy-[M]M "` | Yes | | `"yyyy-[M]M-[d]d"` | Yes | | `"yyyy-[M]M-[d]d "` | Yes | +| `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | +| `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | | `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | | `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | | `"[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | @@ -892,8 +894,8 @@ Casting from string to timestamp currently has the following limitations. | `"tomorrow"` | Yes | | `"yesterday"` | Yes | -- [1] The timestamp portion must have 6 digits for milliseconds. - Only timezone 'Z' (UTC) is supported. Casting unsupported formats will result in null values. +- [1] Leap seconds are not supported. If a zone_id is provided then only + timezone 'Z' (UTC) is supported. Casting unsupported formats will result in null values. 
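For illustration only (this snippet is not part of the patch), the second-precision formats documented above can be exercised from PySpark using the same config keys the cast integration tests in this patch rely on; an existing SparkSession `spark` running with the RAPIDS Accelerator is assumed:

```python
# Sketch, assuming a SparkSession `spark` with the RAPIDS Accelerator enabled.
# The config key below is the one used by the cast tests in this patch.
spark.conf.set("spark.rapids.sql.castStringToTimestamp.enabled", "true")

df = spark.createDataFrame(
    [("2022-01-02T03:04:05",),        # "yyyy-[M]M-[d]dT[h]h:[m]m:[s]s", no zone_id
     ("2022-01-02 03:04:05Z",),       # zone_id 'Z' (UTC), the only supported zone
     ("2022-01-02 03:04:05+01:00",)], # unsupported zone_id, see footnote [1]
    "ts_str string")
df.selectExpr("cast(ts_str as timestamp) as ts").show(truncate=False)
```

Per footnote [1], any zone_id other than 'Z' (UTC) is an unsupported format and the cast produces a null value.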
Spark is very lenient when casting from string to timestamp because all date and time components are optional, meaning that input values such as `T`, `T2`, `:`, `::`, `1:`, `:1`, and `::1` diff --git a/docs/configs.md b/docs/configs.md index b4598b315a8..edcf1bcc621 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -225,6 +225,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.Explode|`explode`, `explode_outer`|Given an input array produces a sequence of rows for each value in the array|true|None| spark.rapids.sql.expression.Expm1|`expm1`|Euler's number e raised to a power minus 1|true|None| spark.rapids.sql.expression.Floor|`floor`|Floor of a number|true|None| +spark.rapids.sql.expression.FromUTCTimestamp|`from_utc_timestamp`|Render the input UTC timestamp in the input timezone|true|None| spark.rapids.sql.expression.FromUnixTime|`from_unixtime`|Get the string from a unix timestamp|true|None| spark.rapids.sql.expression.GetArrayItem| |Gets the field at `ordinal` in the Array|true|None| spark.rapids.sql.expression.GetArrayStructFields| |Extracts the `ordinal`-th fields of all array elements for the data with the type of array of struct|true|None| diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 811cbfa80a6..cdd8f6f1105 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -5442,7 +5442,7 @@ are limited. -PS
Because of Spark's inner workings the full range of decimal precision (even for 128-bit values) is not supported.
+S @@ -6114,6 +6114,74 @@ are limited. +FromUTCTimestamp +`from_utc_timestamp` +Render the input UTC timestamp in the input timezone +None +project +timestamp + + + + + + + + +PS
UTC is only supported TZ for TIMESTAMP
+ + + + + + + + + + + +timezone + + + + + + + + + +PS
Only timezones equivalent to UTC are supported
+ + + + + + + + + + +result + + + + + + + + +PS
UTC is only supported TZ for TIMESTAMP
+ + + + + + + + + + + FromUnixTime `from_unixtime` Get the string from a unix timestamp @@ -6250,6 +6318,32 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + GetArrayStructFields Extracts the `ordinal`-th fields of all array elements for the data with the type of array of struct @@ -6365,32 +6459,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - GetMapValue Gets Value from a Map based on a key @@ -6706,6 +6774,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + GreaterThanOrEqual `>=` >= operator @@ -6838,32 +6932,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - Greatest `greatest` Returns the greatest value of all parameters, skipping null values @@ -7115,6 +7183,32 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + In `in` IN operator @@ -7230,32 +7324,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - InitCap `initcap` Returns str with the first letter of each word in uppercase. All other letters are in lowercase @@ -7496,6 +7564,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + IsNotNull `isnotnull` Checks if a value is not null @@ -7637,32 +7731,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - KnownFloatingPointNormalized Tag to prevent redundant normalization @@ -7914,6 +7982,32 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + LastDay `last_day` Returns the last day of the month which the date belongs to @@ -8050,32 +8144,6 @@ are limited. NS -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - Least `least` Returns the least value of all parameters, skipping null values @@ -8302,6 +8370,32 @@ are limited. 
+Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + LessThanOrEqual `<=` <= operator @@ -8434,32 +8528,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - Like `like` Like @@ -8670,6 +8738,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + Log1p `log1p` Natural log 1 + expr @@ -8832,32 +8926,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - Lower `lower`, `lcase` String lowercase operator @@ -9046,6 +9114,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + MapFilter `map_filter` Filters entries in a map using the function @@ -9208,32 +9302,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - Md5 `md5` MD5 hash operator @@ -9533,6 +9601,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + Murmur3Hash `hash` Murmur3 hash operator @@ -9580,32 +9674,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - NaNvl `nanvl` Evaluates to `left` iff left is not NaN, `right` otherwise @@ -9905,6 +9973,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + Or `or` Logical OR @@ -10037,32 +10131,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - PercentRank `percent_rank` Window function that returns the percent rank value within the aggregation window @@ -10126,7 +10194,7 @@ are limited. -S +PS
decimals with precision 38 are not supported
@@ -10357,6 +10425,32 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT + + PreciseTimestampConversion Expression used internally to convert the TimestampType to Long and back without losing precision, i.e. in microseconds. Used in time windowing @@ -10404,32 +10498,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT - - PromotePrecision PromotePrecision before arithmetic operations between DecimalType data @@ -16486,8 +16554,8 @@ are limited. S S S -PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
-PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
+S +S S PS
UTC is only supported TZ for TIMESTAMP
S @@ -16529,8 +16597,8 @@ are limited. S S S -PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
-PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
+S +S S PS
UTC is only supported TZ for TIMESTAMP
S @@ -16572,8 +16640,8 @@ are limited. S S S -PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
-PS
Input must not contain NaNs and spark.rapids.sql.hasNans must be false.
+S +S S PS
UTC is only supported TZ for TIMESTAMP
S diff --git a/integration_tests/README.md b/integration_tests/README.md index c1151a7d02e..7da0ec89ca9 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -37,20 +37,20 @@ It is recommended that you use `pyenv` to manage Python installations. - Follow instructions to use the right method of installation described [here](https://github.com/pyenv/pyenv#installation) - Verify that `pyenv` is set correctly - + ```shell script - which pyenv + which pyenv ``` - Using `pyenv` to set Python installation - To check versions to be installed (will return a long list) - + ```shell script ls ~/.pyenv/versions/ ``` - To install a specific version from the available list - + ```shell script pyenv install 3.X.Y ``` @@ -116,7 +116,7 @@ You can install all the dependencies using `pip` by running the following comman ### Installing Spark You need to install spark-3.x and set `$SPARK_HOME/bin` to your `$PATH`, where -`SPARK_HOME` points to the directory of a runnable Spark distribution. +`SPARK_HOME` points to the directory of a runnable Spark distribution. This can be done in the following three steps: 1. Choose the appropriate way to create Spark distribution: @@ -156,10 +156,10 @@ Make sure that you compile the plugin against the same version of Spark that it Tests will run as a part of the maven build if you have the environment variable `SPARK_HOME` set. The suggested way to run these tests is to use the shell-script file located in the - integration_tests folder called [run_pyspark_from_build.sh](run_pyspark_from_build.sh). This script takes -care of some of the flags that are required to run the tests which will have to be set for the -plugin to work. It will be very useful to read the contents of the -[run_pyspark_from_build.sh](run_pyspark_from_build.sh) to get a better insight + integration_tests folder called [run_pyspark_from_build.sh](run_pyspark_from_build.sh). This script takes +care of some of the flags that are required to run the tests which will have to be set for the +plugin to work. It will be very useful to read the contents of the +[run_pyspark_from_build.sh](run_pyspark_from_build.sh) to get a better insight into what is needed as we constantly keep working on to improve and expand the plugin-support. The python tests run with pytest and the script honors pytest parameters. Some handy flags are: @@ -221,16 +221,18 @@ To run the tests separate from the build go to the `integration_tests` directory `runtests.py` through `spark-submit`, but if you want to run the tests in parallel with `pytest-xdist` you will need to submit it as a regular python application and have `findspark` installed. Be sure to include the necessary jars for the RAPIDS plugin either with -`spark-submit` or with the cluster when it is +`spark-submit` or with the cluster when it is [setup](../docs/get-started/getting-started-on-prem.md). -The command line arguments to `runtests.py` are the same as for +The command line arguments to `runtests.py` are the same as for [pytest](https://docs.pytest.org/en/latest/usage.html). The only reason we have a separate script is that `spark-submit` uses python if the file name ends with `.py`. If you want to configure the Spark cluster you may also set environment variables for the tests. The name of the env var should be in the form `"PYSP_TEST_" + conf_key.replace('.', '_')`. Linux -does not allow '.' in the name of an environment variable so we replace it with an underscore. 
As -Spark configs avoid this character we have no other special processing. +does not allow '.' in the name of an environment variable so we replace it with an underscore. If +the property contains an underscore, substitute '__' for each original '_'. +For example, `spark.sql.catalog.spark_catalog` is represented by the environment variable +`PYSP_TEST_spark_sql_catalog_spark__catalog`. We also have a large number of integration tests that currently run as a part of the unit tests using scala test. Those are in the `src/test/scala` sub-directory and depend on the testing @@ -252,7 +254,7 @@ It is recommended that you use `spark-shell` and the scalatest shell to run each individually, so you don't risk running unit tests along with the integration tests. http://www.scalatest.org/user_guide/using_the_scalatest_shell -```shell +```shell spark-shell --jars rapids-4-spark-tests_2.12-22.10.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-22.10.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` @@ -366,8 +368,8 @@ cudf_udf tests needs a couple of different settings, they may need to run separa To enable cudf_udf tests, need following pre requirements: * Install cuDF Python library on all the nodes running executors. The instruction could be found at [here](https://rapids.ai/start.html). Please follow the steps to choose the version based on your environment and install the cuDF library via Conda or use other ways like building from source. * Disable the GPU exclusive mode on all the nodes running executors. The sample command is `sudo nvidia-smi -c DEFAULT` - -To run cudf_udf tests, need following configuration changes: + +To run cudf_udf tests, need following configuration changes: * Add configurations `--py-files` and `spark.executorEnv.PYTHONPATH` to specify the plugin jar for python modules 'rapids/daemon' 'rapids/worker'. * Decrease `spark.rapids.memory.gpu.allocFraction` to reserve enough GPU memory for Python processes in case of out-of-memory. * Add `spark.rapids.python.concurrentPythonWorkers` and `spark.rapids.python.memory.gpu.allocFraction` to reserve enough GPU memory for Python processes in case of out-of-memory. @@ -380,7 +382,7 @@ $SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-22.10.0-SNAPSHOT-cuda11 ### Enabling fuzz tests -Fuzz tests are intended to find more corner cases in testing. We disable them by default because they might randomly fail. +Fuzz tests are intended to find more corner cases in testing. We disable them by default because they might randomly fail. The tests can be enabled by appending the option `--fuzz_test` to the command. * `--fuzz_test` (enable the fuzz tests when provided, and remove this option if you want to disable the tests) @@ -459,33 +461,33 @@ When support for a new operator is added to the Rapids Accelerator for Spark, or to support more data types, it is recommended that the following conditions be covered in its corresponding integration tests: ### 1. Cover all supported data types -Ensure that tests cover all data types supported by the added operation. An exhaustive list of data types supported in +Ensure that tests cover all data types supported by the added operation. An exhaustive list of data types supported in Apache Spark is available [here](https://spark.apache.org/docs/latest/sql-ref-datatypes.html). 
These include: - * Numeric Types - * `ByteType` - * `ShortType` + * Numeric Types + * `ByteType` + * `ShortType` * `IntegerType` * `LongType` * `FloatType` * `DoubleType` * `DecimalType` - * Strings - * `StringType` + * Strings + * `StringType` * `VarcharType` - * Binary (`BinaryType`) + * Binary (`BinaryType`) * Booleans (`BooleanType`) - * Chrono Types - * `TimestampType` + * Chrono Types + * `TimestampType` * `DateType` * `Interval` - * Complex Types + * Complex Types * `ArrayType` * `StructType` * `MapType` `data_gen.py` provides `DataGen` classes that help generate test data in integration tests. -The `assert_gpu_and_cpu_are_equal_collect()` function from `asserts.py` may be used to compare that an operator in +The `assert_gpu_and_cpu_are_equal_collect()` function from `asserts.py` may be used to compare that an operator in the Rapids Accelerator produces the same results as Apache Spark, for a test query. For data types that are not currently supported for an operator in the Rapids Accelerator, @@ -505,17 +507,17 @@ E.g. The `ArrayGen` and `StructGen` classes in `data_gen.py` can be configured to support arbitrary nesting. ### 3. Literal (i.e. Scalar) values -Operators and expressions that support literal operands need to be tested with literal inputs, of all -supported types from 1 and 2, above. +Operators and expressions that support literal operands need to be tested with literal inputs, of all +supported types from 1 and 2, above. For instance, `SUM()` supports numeric columns (e.g. `SUM(a + b)`), or scalars (e.g. `SUM(20)`). Similarly, `COUNT()` supports the following: * Columns: E.g. `COUNT(a)` to count non-null rows for column `a` * Scalars: E.g. `COUNT(1)` to count all rows (including nulls) * `*`: E.g. `COUNT(*)`, functionally equivalent to `COUNT(1)` It is advised that tests be added for all applicable literal types, for an operator. - + Note that for most operations, if all inputs are literal values, the Spark Catalyst optimizer will evaluate -the expression during the logical planning phase of query compilation, via +the expression during the logical planning phase of query compilation, via [Constant Folding](https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-Optimizer-ConstantFolding.html) E.g. Consider this query: ```sql @@ -529,18 +531,18 @@ need not necessarily add more test coverage. Ensure that the test data accommodates null values for input columns. This includes null values in columns and in literal inputs. -Null values in input columns are a frequent source of bugs in the Rapids Accelerator for Spark, -because of mismatches in null-handling and semantics, between RAPIDS `libcudf` (on which -the Rapids Accelerator relies heavily), and Apache Spark. +Null values in input columns are a frequent source of bugs in the Rapids Accelerator for Spark, +because of mismatches in null-handling and semantics, between RAPIDS `libcudf` (on which +the Rapids Accelerator relies heavily), and Apache Spark. -Tests for aggregations (including group-by, reductions, and window aggregations) should cover cases where +Tests for aggregations (including group-by, reductions, and window aggregations) should cover cases where some rows are null, and where *all* input rows are null. Apart from null rows in columns of primitive types, the following conditions must be covered for nested types: * Null rows at the "top" level for `Array`/`Struct` columns. E.g. `[ [1,2], [3], ∅, [4,5,6] ]`. * Non-null rows containing null elements in the child column. E.g. 
`[ [1,2], [3,∅], ∅, [4,∅,6] ]`. - * All null rows at a nested level. E.g. + * All null rows at a nested level. E.g. * All null list rows: `[ ∅, ∅, ∅, ∅ ]` * All null elements within list rows: `[ [∅,∅], [∅,∅], [∅,∅], [∅,∅] ]` @@ -579,10 +581,10 @@ describes this with examples. Operations should be tested with multiple bit-repr The `FloatGen` and `DoubleGen` data generators in `integration_tests/src/main/python/data_gen.py` can be configured to generate the special float/double values mentioned above. -For most basic floating-point operations like addition, subtraction, multiplication, and division the plugin will +For most basic floating-point operations like addition, subtraction, multiplication, and division the plugin will produce a bit for bit identical result as Spark does. For some other functions (like `sin`, `cos`, etc.), the output may differ slightly, but remain within the rounding error inherent in floating-point calculations. Certain aggregations -might compound those differences. In those cases, the `@approximate_float` test annotation may be used to mark tests +might compound those differences. In those cases, the `@approximate_float` test annotation may be used to mark tests to use "approximate" comparisons for floating-point values. Refer to the "Floating Point" section of [compatibility.md](../docs/compatibility.md) for details. @@ -590,11 +592,11 @@ Refer to the "Floating Point" section of [compatibility.md](../docs/compatibilit ### 8. Special values in timestamp columns Ensure date/timestamp columns include dates before the [epoch](https://en.wikipedia.org/wiki/Epoch_(computing)). -Apache Spark supports dates/timestamps between `0001-01-01 00:00:00.000000` and `9999-12-31 23:59:59.999999`, but at +Apache Spark supports dates/timestamps between `0001-01-01 00:00:00.000000` and `9999-12-31 23:59:59.999999`, but at values close to the minimum value, the format used in Apache Spark causes rounding errors. To avoid such problems, it is recommended that the minimum value used in a test not actually equal `0001-01-01`. For instance, `0001-01-03` is acceptable. -It is advised that `DateGen` and `TimestampGen` classes from `data_gen.py` be used to generate valid -(proleptic Gregorian calendar) dates when testing operators that work on dates. This data generator respects +It is advised that `DateGen` and `TimestampGen` classes from `data_gen.py` be used to generate valid +(proleptic Gregorian calendar) dates when testing operators that work on dates. This data generator respects the valid boundaries for dates and timestamps. diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 3ac660c5c45..d3b9adacb57 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -25,6 +25,7 @@ then >&2 echo "SPARK_HOME IS NOT SET CANNOT RUN PYTHON INTEGRATION TESTS..." else echo "WILL RUN TESTS WITH SPARK_HOME: ${SPARK_HOME}" + [[ ! -x "$(command -v zip)" ]] && { echo "fail to find zip command in $PATH"; exit 1; } # Spark 3.1.1 includes https://github.com/apache/spark/pull/31540 # which helps with spurious task failures as observed in our tests. 
If you are running # Spark versions before 3.1.1, this sets the spark.max.taskFailures to 4 to allow for @@ -258,7 +259,14 @@ else # If you want to change the amount of GPU memory allocated you have to change it here # and where TEST_PARALLEL is calculated - export PYSP_TEST_spark_rapids_memory_gpu_allocSize='1536m' + if [[ -n "${PYSP_TEST_spark_rapids_memory_gpu_allocSize}" ]]; then + >&2 echo "#### WARNING: using externally set" \ + "PYSP_TEST_spark_rapids_memory_gpu_allocSize" \ + "${PYSP_TEST_spark_rapids_memory_gpu_allocSize}." \ + "If needed permanently in CI please file an issue to accommodate" \ + "for new GPU memory requirements ####" + fi + export PYSP_TEST_spark_rapids_memory_gpu_allocSize=${PYSP_TEST_spark_rapids_memory_gpu_allocSize:-'1536m'} if ((${#TEST_PARALLEL_OPTS[@]} > 0)); then @@ -266,11 +274,17 @@ else else # We set the GPU memory size to be a constant value even if only running with a parallelism of 1 # because it helps us have consistent test runs. + jarOpts=() if [[ -n "$PYSP_TEST_spark_jars" ]]; then - # `spark.jars` is the same as `--jars`, e.g.: --jars a.jar,b.jar... - jarOpts=(--conf spark.jars="${PYSP_TEST_spark_jars}") - elif [[ -n "$PYSP_TEST_spark_driver_extraClassPath" ]]; then - jarOpts=(--driver-class-path "${PYSP_TEST_spark_driver_extraClassPath}") + jarOpts+=(--jars "${PYSP_TEST_spark_jars}") + fi + + if [[ -n "$PYSP_TEST_spark_jars_packages" ]]; then + jarOpts+=(--packages "${PYSP_TEST_spark_jars_packages}") + fi + + if [[ -n "$PYSP_TEST_spark_driver_extraClassPath" ]]; then + jarOpts+=(--driver-class-path "${PYSP_TEST_spark_driver_extraClassPath}") fi driverJavaOpts="$PYSP_TEST_spark_driver_extraJavaOptions" @@ -281,6 +295,7 @@ else unset PYSP_TEST_spark_driver_extraClassPath unset PYSP_TEST_spark_driver_extraJavaOptions unset PYSP_TEST_spark_jars + unset PYSP_TEST_spark_jars_packages unset PYSP_TEST_spark_rapids_memory_gpu_allocSize exec "$SPARK_HOME"/bin/spark-submit "${jarOpts[@]}" \ diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index f86f6294180..9e2e3e06a38 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -27,12 +27,17 @@ # No overflow gens here because we just focus on verifying the fallback to CPU when # enabling ANSI mode. But overflows will fail the tests because CPU runs raise # exceptions. 
-_no_overflow_multiply_gens = [ +_no_overflow_multiply_gens_for_fallback = [ ByteGen(min_val = 1, max_val = 10, special_cases=[]), ShortGen(min_val = 1, max_val = 100, special_cases=[]), IntegerGen(min_val = 1, max_val = 1000, special_cases=[]), LongGen(min_val = 1, max_val = 3000, special_cases=[])] + +_no_overflow_multiply_gens = _no_overflow_multiply_gens_for_fallback + [ + DecimalGen(10, 0), + DecimalGen(19, 0)] + _decimal_gen_7_7 = DecimalGen(precision=7, scale=7) _decimal_gen_18_0 = DecimalGen(precision=18, scale=0) _decimal_gen_18_3 = DecimalGen(precision=18, scale=3) @@ -43,11 +48,14 @@ _decimal_gen_38_10 = DecimalGen(precision=38, scale=10) _decimal_gen_38_neg10 = DecimalGen(precision=38, scale=-10) -_arith_data_gens_diff_precision_scale_and_no_neg_scale = [ +_arith_data_gens_diff_precision_scale_and_no_neg_scale_no_38_0 = [ decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_18_0, decimal_gen_128bit, - _decimal_gen_30_2, _decimal_gen_36_5, _decimal_gen_38_0, _decimal_gen_38_10 + _decimal_gen_30_2, _decimal_gen_36_5, _decimal_gen_38_10 ] +_arith_data_gens_diff_precision_scale_and_no_neg_scale = \ + _arith_data_gens_diff_precision_scale_and_no_neg_scale_no_38_0 + [_decimal_gen_38_0] + _arith_decimal_gens_no_neg_scale = _arith_data_gens_diff_precision_scale_and_no_neg_scale + [_decimal_gen_7_7] _arith_decimal_gens = _arith_decimal_gens_no_neg_scale + [ @@ -58,6 +66,12 @@ _arith_data_gens_no_neg_scale = numeric_gens + _arith_decimal_gens_no_neg_scale +_arith_decimal_gens_no_neg_scale_38_0_overflow = \ + _arith_data_gens_diff_precision_scale_and_no_neg_scale_no_38_0 + [ + _decimal_gen_7_7, + pytest.param(_decimal_gen_38_0, marks=pytest.mark.skipif( + is_spark_330_or_later(), reason='This case overflows in Spark 3.3.0+'))] + def _get_overflow_df(spark, data, data_type, expr): return spark.createDataFrame( SparkContext.getOrCreate().parallelize([data]), @@ -114,19 +128,24 @@ def test_subtraction_ansi_no_overflow(data_gen): @pytest.mark.parametrize('data_gen', numeric_gens + [ decimal_gen_32bit_neg_scale, decimal_gen_32bit, _decimal_gen_7_7, - DecimalGen(precision=8, scale=8), decimal_gen_64bit, _decimal_gen_18_3], ids=idfn) + DecimalGen(precision=8, scale=8), decimal_gen_64bit, _decimal_gen_18_3, + _decimal_gen_38_10, + _decimal_gen_38_neg10 + ], ids=idfn) def test_multiplication(data_gen): data_type = data_gen.data_type assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).select( + f.col('a'), f.col('b'), f.col('a') * f.lit(100).cast(data_type), f.lit(-12).cast(data_type) * f.col('b'), f.lit(None).cast(data_type) * f.col('a'), f.col('b') * f.lit(None).cast(data_type), - f.col('a') * f.col('b'))) + f.col('a') * f.col('b') + )) @allow_non_gpu('ProjectExec', 'Alias', 'Multiply', 'Cast') -@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens, ids=idfn) +@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens_for_fallback, ids=idfn) def test_multiplication_fallback_when_ansi_enabled(data_gen): assert_gpu_fallback_collect( lambda spark : binary_op_df(spark, data_gen).select( @@ -134,7 +153,7 @@ def test_multiplication_fallback_when_ansi_enabled(data_gen): 'Multiply', conf=ansi_enabled_conf) -@pytest.mark.parametrize('data_gen', [float_gen, double_gen, decimal_gen_32bit], ids=idfn) +@pytest.mark.parametrize('data_gen', [float_gen, double_gen, decimal_gen_32bit, DecimalGen(19, 0)], ids=idfn) def test_multiplication_ansi_enabled(data_gen): data_type = data_gen.data_type assert_gpu_and_cpu_are_equal_collect( @@ -143,8 +162,18 @@ def 
test_multiplication_ansi_enabled(data_gen): f.col('a') * f.col('b')), conf=ansi_enabled_conf) -@pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(6, 5), DecimalGen(6, 4), DecimalGen(5, 4), DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(3, -2), DecimalGen(16, 7)], ids=idfn) -@pytest.mark.parametrize('rhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(6, 3), DecimalGen(10, -2), DecimalGen(15, 3)], ids=idfn) +def test_multiplication_ansi_overflow(): + exception_str = 'ArithmeticException' + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, DecimalGen(38, 0)).selectExpr("a * " + "9"*38 + " as ret").collect(), + ansi_enabled_conf, + exception_str) + +@pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(6, 5), DecimalGen(6, 4), DecimalGen(5, 4), DecimalGen(5, 3), + DecimalGen(4, 2), DecimalGen(3, -2), DecimalGen(16, 7), DecimalGen(19, 0), + DecimalGen(30, 10)], ids=idfn) +@pytest.mark.parametrize('rhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(6, 3), DecimalGen(10, -2), DecimalGen(15, 3), + DecimalGen(30, 12), DecimalGen(3, -3), DecimalGen(27, 7), DecimalGen(20, -3)], ids=idfn) def test_multiplication_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( @@ -161,7 +190,7 @@ def test_float_multiplication_mixed(lhs, rhs): @pytest.mark.parametrize('data_gen', [double_gen, decimal_gen_32bit_neg_scale, DecimalGen(6, 3), DecimalGen(5, 5), DecimalGen(6, 0), DecimalGen(7, 4), DecimalGen(15, 0), DecimalGen(18, 0), - DecimalGen(17, 2), DecimalGen(16, 4)], ids=idfn) + DecimalGen(17, 2), DecimalGen(16, 4), DecimalGen(38, 21), DecimalGen(21, 17), DecimalGen(3, -2)], ids=idfn) def test_division(data_gen): data_type = data_gen.data_type assert_gpu_and_cpu_are_equal_collect( @@ -172,21 +201,33 @@ def test_division(data_gen): f.col('b') / f.lit(None).cast(data_type), f.col('a') / f.col('b'))) -@allow_non_gpu('ProjectExec', 'Alias', 'Divide', 'Cast', 'PromotePrecision', 'CheckOverflow') -@pytest.mark.parametrize('data_gen', [DecimalGen(38, 21), DecimalGen(21, 17)], ids=idfn) -def test_division_fallback_on_decimal(data_gen): - assert_gpu_fallback_collect( - lambda spark : binary_op_df(spark, data_gen).select( - f.col('a') / f.col('b')), - 'Divide') - @pytest.mark.parametrize('rhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(4, 1), DecimalGen(5, 0), DecimalGen(5, 1), DecimalGen(10, 5)], ids=idfn) @pytest.mark.parametrize('lhs', [byte_gen, short_gen, int_gen, long_gen, DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(1, -2), DecimalGen(16, 1)], ids=idfn) def test_division_mixed(lhs, rhs): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, lhs, rhs).select( + f.col('a'), f.col('b'), f.col('a') / f.col('b'))) +# Spark has some problems with some decimal operations where it can try to generate a type that is invalid (scale > precision) which results in an error +# instead of increasing the precision. 
So we have a second test that deals with a few of these use cases +@pytest.mark.parametrize('rhs', [DecimalGen(30, 10), DecimalGen(28, 18)], ids=idfn) +@pytest.mark.parametrize('lhs', [DecimalGen(27, 7), DecimalGen(20, -3)], ids=idfn) +def test_division_mixed_larger_dec(lhs, rhs): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : two_col_df(spark, lhs, rhs).select( + f.col('a'), f.col('b'), + f.col('a') / f.col('b'))) + +def test_special_decimal_division(): + for precision in range(1, 39): + for scale in range(-3, precision + 1): + print("PRECISION " + str(precision) + " SCALE " + str(scale)) + data_gen = DecimalGen(precision, scale) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : two_col_df(spark, data_gen, data_gen).select( + f.col('a') / f.col('b'))) + @approximate_float # we should get the perfectly correct answer for floats except when casting a decimal to a float in some corner cases. @pytest.mark.parametrize('rhs', [float_gen, double_gen], ids=idfn) @pytest.mark.parametrize('lhs', [DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(1, -2), DecimalGen(16, 1)], ids=idfn) @@ -196,38 +237,6 @@ def test_float_division_mixed(lhs, rhs): f.col('a') / f.col('b')), conf={'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) -@ignore_order -@pytest.mark.parametrize('rhs,rhs_type', [ - (DecimalGen(15, 3), DecimalType(30, 10)), - (DecimalGen(10, 2), DecimalType(28, 18))], ids=idfn) -@pytest.mark.parametrize('lhs,lhs_type', [ - (DecimalGen(15, 3), DecimalType(27, 7)), - (DecimalGen(3, -3), DecimalType(20, -3))], ids=idfn) -def test_decimal_division_mixed_no_overflow_guarantees(lhs, lhs_type, rhs, rhs_type): - assert_gpu_and_cpu_are_equal_collect( - lambda spark : two_col_df(spark, lhs, rhs)\ - .withColumn('lhs', f.col('a').cast(lhs_type))\ - .withColumn('rhs', f.col('b').cast(rhs_type))\ - .repartition(1)\ - .select(f.col('lhs'), f.col('rhs'), f.col('lhs') / f.col('rhs')), - conf={'spark.rapids.sql.decimalOverflowGuarantees': 'false'}) - -@ignore_order -@pytest.mark.parametrize('rhs,rhs_type', [ - (DecimalGen(15, 3), DecimalType(30, 10)), - (DecimalGen(10, 2), DecimalType(28, 9))], ids=idfn) -@pytest.mark.parametrize('lhs,lhs_type', [ - (DecimalGen(10, 3), DecimalType(27, 7)), - (DecimalGen(3, -3), DecimalType(20, -3))], ids=idfn) -def test_decimal_multiplication_mixed_no_overflow_guarantees(lhs, lhs_type, rhs, rhs_type): - assert_gpu_and_cpu_are_equal_collect( - lambda spark : two_col_df(spark, lhs, rhs)\ - .withColumn('lhs', f.col('a').cast(lhs_type))\ - .withColumn('rhs', f.col('b').cast(rhs_type))\ - .repartition(1)\ - .select(f.col('lhs'), f.col('rhs'), f.col('lhs') * f.col('rhs')), - conf={'spark.rapids.sql.decimalOverflowGuarantees': 'false'}) - @pytest.mark.parametrize('data_gen', integral_gens + [ decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_7_7, _decimal_gen_18_3, _decimal_gen_30_2, _decimal_gen_36_5, _decimal_gen_38_0], ids=idfn) @@ -259,7 +268,14 @@ def test_mod(data_gen): f.col('b') % f.lit(None).cast(data_type), f.col('a') % f.col('b'))) -@pytest.mark.parametrize('data_gen', _arith_data_gens_no_neg_scale, ids=idfn) +# pmod currently falls back for Decimal(precision=38) +# https://github.com/NVIDIA/spark-rapids/issues/6336 +_pmod_gens = numeric_gens + [ decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_18_0, decimal_gen_128bit, + _decimal_gen_30_2, _decimal_gen_36_5, + DecimalGen(precision=37, scale=0), DecimalGen(precision=37, scale=10), + _decimal_gen_7_7] + +@pytest.mark.parametrize('data_gen', _pmod_gens, ids=idfn) def test_pmod(data_gen): string_type 
= to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -270,6 +286,19 @@ def test_pmod(data_gen): 'pmod(b, cast(null as {}))'.format(string_type), 'pmod(a, b)')) +@allow_non_gpu("ProjectExec", "Pmod") +@pytest.mark.parametrize('data_gen', [_decimal_gen_38_0, _decimal_gen_38_10], ids=idfn) +def test_pmod_fallback(data_gen): + string_type = to_cast_string(data_gen.data_type) + assert_gpu_fallback_collect( + lambda spark : binary_op_df(spark, data_gen).selectExpr( + 'pmod(a, cast(100 as {}))'.format(string_type), + 'pmod(cast(-12 as {}), b)'.format(string_type), + 'pmod(cast(null as {}), a)'.format(string_type), + 'pmod(b, cast(null as {}))'.format(string_type), + 'pmod(a, b)'), + "Pmod") + # test pmod(Long.MinValue, -1) = 0 and Long.MinValue % -1 = 0, should not throw def test_mod_pmod_long_min_value(): assert_gpu_and_cpu_are_equal_collect( @@ -278,7 +307,10 @@ def test_mod_pmod_long_min_value(): 'a % -1L'), ansi_enabled_conf) -@pytest.mark.parametrize('data_gen', _arith_data_gens_diff_precision_scale_and_no_neg_scale, ids=idfn) +# pmod currently falls back for Decimal(precision=38) +# https://github.com/NVIDIA/spark-rapids/issues/6336 +@pytest.mark.parametrize('data_gen', [decimal_gen_32bit, decimal_gen_64bit, _decimal_gen_18_0, + decimal_gen_128bit, _decimal_gen_30_2, _decimal_gen_36_5], ids=idfn) @pytest.mark.parametrize('overflow_exp', [ 'pmod(a, cast(0 as {}))', 'pmod(cast(-12 as {}), cast(0 as {}))', @@ -314,7 +346,7 @@ def test_cast_neg_to_decimal_err(): ansi_enabled_conf, exception_type + exception_content) -@pytest.mark.parametrize('data_gen', _arith_data_gens_no_neg_scale, ids=idfn) +@pytest.mark.parametrize('data_gen', _pmod_gens, ids=idfn) def test_mod_pmod_by_zero_not_ansi(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -431,7 +463,7 @@ def test_floor_scale_zero(data_gen): @pytest.mark.skipif(is_before_spark_330(), reason='scale parameter in Floor function is not supported before Spark 3.3.0') @allow_non_gpu('ProjectExec') -@pytest.mark.parametrize('data_gen', double_n_long_gens + _arith_decimal_gens_no_neg_scale, ids=idfn) +@pytest.mark.parametrize('data_gen', double_n_long_gens + _arith_decimal_gens_no_neg_scale_38_0_overflow, ids=idfn) def test_floor_scale_nonzero(data_gen): assert_gpu_fallback_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr('floor(a, -1)'), 'RoundFloor') @@ -502,7 +534,7 @@ def test_shift_right_unsigned(data_gen): 'shiftrightunsigned(a, cast(null as INT))', 'shiftrightunsigned(a, b)')) -_arith_data_gens_for_round = numeric_gens + _arith_decimal_gens_no_neg_scale + [ +_arith_data_gens_for_round = numeric_gens + _arith_decimal_gens_no_neg_scale_38_0_overflow + [ decimal_gen_32bit_neg_scale, DecimalGen(precision=15, scale=-8), DecimalGen(precision=30, scale=-5), @@ -1050,16 +1082,15 @@ def test_unary_positive_day_time_interval(): lambda spark: unary_op_df(spark, DayTimeIntervalGen()).selectExpr('+a')) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens + [DoubleGen(min_exp=-3, max_exp=5, special_cases=[0.0])], ids=idfn) +@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens_for_fallback + [DoubleGen(min_exp=-3, max_exp=5, special_cases=[0.0])], ids=idfn) def test_day_time_interval_multiply_number(data_gen): gen_list = [('_c1', DayTimeIntervalGen(min_value=timedelta(seconds=-20 * 86400), max_value=timedelta(seconds=20 * 86400))), 
('_c2', data_gen)] assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, gen_list).selectExpr("_c1 * _c2")) - @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens + [DoubleGen(min_exp=0, max_exp=5, special_cases=[])], ids=idfn) +@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens_for_fallback + [DoubleGen(min_exp=0, max_exp=5, special_cases=[])], ids=idfn) def test_day_time_interval_division_number_no_overflow1(data_gen): gen_list = [('_c1', DayTimeIntervalGen(min_value=timedelta(seconds=-5000 * 365 * 86400), max_value=timedelta(seconds=5000 * 365 * 86400))), ('_c2', data_gen)] @@ -1068,7 +1099,7 @@ def test_day_time_interval_division_number_no_overflow1(data_gen): lambda spark: gen_df(spark, gen_list).selectExpr("_c1 / case when _c2 = 0 then cast(1 as {}) else _c2 end".format(to_cast_string(data_gen.data_type)))) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens + [DoubleGen(min_exp=-5, max_exp=0, special_cases=[])], ids=idfn) +@pytest.mark.parametrize('data_gen', _no_overflow_multiply_gens_for_fallback + [DoubleGen(min_exp=-5, max_exp=0, special_cases=[])], ids=idfn) def test_day_time_interval_division_number_no_overflow2(data_gen): gen_list = [('_c1', DayTimeIntervalGen(min_value=timedelta(seconds=-20 * 86400), max_value=timedelta(seconds=20 * 86400))), ('_c2', data_gen)] diff --git a/integration_tests/src/main/python/cache_test.py b/integration_tests/src/main/python/cache_test.py index cff9d4e7a07..8e37fb61d1c 100644 --- a/integration_tests/src/main/python/cache_test.py +++ b/integration_tests/src/main/python/cache_test.py @@ -26,7 +26,9 @@ enable_vectorized_confs = [{"spark.sql.inMemoryColumnarStorage.enableVectorizedReader": "true"}, {"spark.sql.inMemoryColumnarStorage.enableVectorizedReader": "false"}] -_cache_decimal_gens = [decimal_gen_32bit, decimal_gen_64bit, decimal_gen_128bit] +# Many tests sort the results, so use a sortable decimal generator as many Spark versions +# fail to sort some large decimals properly. 
+_cache_decimal_gens = [decimal_gen_32bit, decimal_gen_64bit, orderable_decimal_gen_128bit] _cache_single_array_gens_no_null = [ArrayGen(gen) for gen in all_basic_gens_no_null + _cache_decimal_gens] decimal_struct_gen= StructGen([['child0', sub_gen] for ind, sub_gen in enumerate(_cache_decimal_gens)]) @@ -166,7 +168,7 @@ def n_fold(spark): pytest.param(FloatGen(special_cases=[FLOAT_MIN, FLOAT_MAX, 0.0, 1.0, -1.0]), marks=[incompat]), pytest.param(DoubleGen(special_cases=double_special_cases), marks=[incompat]), BooleanGen(), DateGen(), TimestampGen(), decimal_gen_32bit, decimal_gen_64bit, - decimal_gen_128bit] + _cache_single_array_gens_no_null_no_timestamp, ids=idfn) + orderable_decimal_gen_128bit] + _cache_single_array_gens_no_null_no_timestamp, ids=idfn) @pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']) @pytest.mark.parametrize('enable_vectorized', ['true', 'false'], ids=idfn) @ignore_order diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 2b15e68b2bf..8c5326c44ec 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -132,11 +132,15 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) -def test_cast_string_ts_valid_format(): +@pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], + ids=idfn) +def test_cast_string_ts_valid_format(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(TimestampType())), + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) @@ -228,9 +232,15 @@ def test_cast_long_to_decimal_overflow(): # casting these types to string should be passed basic_gens_for_cast_to_string = [ByteGen, ShortGen, IntegerGen, LongGen, StringGen, BooleanGen, DateGen, TimestampGen] basic_array_struct_gens_for_cast_to_string = [f() for f in basic_gens_for_cast_to_string] + [null_gen] + decimal_gens + +# We currently do not generate the exact string as Spark for some decimal values of zero +# https://github.com/NVIDIA/spark-rapids/issues/6339 basic_map_gens_for_cast_to_string = [ MapGen(f(nullable=False), f()) for f in basic_gens_for_cast_to_string] + [ - MapGen(DecimalGen(nullable=False), DecimalGen(precision=7, scale=3)), MapGen(DecimalGen(precision=7, scale=7, nullable=False), DecimalGen(precision=12, scale=2))] + MapGen(DecimalGen(nullable=False, special_cases=[]), + DecimalGen(precision=7, scale=3, special_cases=[])), + MapGen(DecimalGen(precision=7, scale=7, nullable=False, special_cases=[]), + DecimalGen(precision=12, scale=2), special_cases=[])] # GPU does not match CPU to casting these types to string, marked as xfail when testing not_matched_gens_for_cast_to_string = [FloatGen, DoubleGen] diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py index 8b880985aa8..97b3be41ba8 100644 --- a/integration_tests/src/main/python/conftest.py +++ b/integration_tests/src/main/python/conftest.py @@ -15,8 +15,15 @@ import os import pytest import random -from spark_init_internal import get_spark_i_know_what_i_am_doing -from pyspark.sql.dataframe import DataFrame + +# TODO redo _spark stuff using fixtures +# +# Don't import pyspark / _spark directly in conftest globally +# import as a plugin to do a lazy per-pytest-session initialization +# +pytest_plugins = [ + 'spark_init_internal' +] _approximate_float_args = None @@ -252,6 +259,7 @@ def get_worker_id(request): @pytest.fixture def spark_tmp_path(request): + from spark_init_internal import get_spark_i_know_what_i_am_doing debug = request.config.getoption('debug_tmp_path') ret = request.config.getoption('tmp_path') if ret is None: @@ -282,6 +290,7 @@ def get(self): @pytest.fixture def spark_tmp_table_factory(request): + from spark_init_internal import get_spark_i_know_what_i_am_doing worker_id = get_worker_id(request) table_id = random.getrandbits(31) base_id = f'tmp_table_{worker_id}_{table_id}' @@ -300,6 +309,7 @@ def _get_jvm(spark): return spark.sparkContext._jvm def spark_jvm(): + from spark_init_internal import get_spark_i_know_what_i_am_doing return _get_jvm(get_spark_i_know_what_i_am_doing()) class MortgageRunner: @@ -309,6 +319,7 @@ def __init__(self, mortgage_format, mortgage_acq_path, mortgage_perf_path): self.mortgage_perf_path = mortgage_perf_path def do_test_query(self, spark): + from pyspark.sql.dataframe import DataFrame jvm_session = _get_jvm_session(spark) jvm = _get_jvm(spark) acq = self.mortgage_acq_path @@ -324,7 +335,7 @@ def do_test_query(self, spark): raise AssertionError('Not Supported Format {}'.format(self.mortgage_format)) return DataFrame(df, spark.getActiveSession()) - + @pytest.fixture(scope="session") def mortgage(request): mortgage_format = 
request.config.getoption("mortgage_format") diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index db4b54b501f..6a8997e7167 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -21,7 +21,7 @@ from pyspark.sql.types import * import pyspark.sql.functions as f import random -from spark_session import is_tz_utc +from spark_session import is_tz_utc, is_before_spark_340 import sre_yield import struct from conftest import skip_unless_precommit_tests @@ -215,17 +215,19 @@ def start(self, rand): class DecimalGen(DataGen): """Generate Decimals, with some built in corner cases.""" - def __init__(self, precision=None, scale=None, nullable=True, special_cases=[]): + def __init__(self, precision=None, scale=None, nullable=True, special_cases=None): if precision is None: #Maximum number of decimal digits a Long can represent is 18 precision = 18 scale = 0 DECIMAL_MIN = Decimal('-' + ('9' * precision) + 'e' + str(-scale)) DECIMAL_MAX = Decimal(('9'* precision) + 'e' + str(-scale)) + if (special_cases is None): + special_cases = [DECIMAL_MIN, DECIMAL_MAX, Decimal('0')] super().__init__(DecimalType(precision, scale), nullable=nullable, special_cases=special_cases) self.scale = scale self.precision = precision - pattern = "[0-9]{1,"+ str(precision) + "}e" + str(-scale) + pattern = "-?[0-9]{1,"+ str(precision) + "}e" + str(-scale) self.base_strs = sre_yield.AllStrings(pattern, flags=0, charset=sre_yield.CHARSET, max_count=_MAX_CHOICES) def __repr__(self): @@ -928,10 +930,18 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False): all_basic_gens_no_nan = [byte_gen, short_gen, int_gen, long_gen, FloatGen(no_nans=True), DoubleGen(no_nans=True), string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] +# Many Spark versions have issues sorting large decimals, +# see https://issues.apache.org/jira/browse/SPARK-40089. 
+orderable_decimal_gen_128bit = decimal_gen_128bit +if is_before_spark_340(): + orderable_decimal_gen_128bit = DecimalGen(precision=20, scale=2, special_cases=[]) + +orderable_decimal_gens = [decimal_gen_32bit, decimal_gen_64bit, orderable_decimal_gen_128bit ] + # TODO add in some array generators to this once that is supported for sorting # a selection of generators that should be orderable (sortable and compareable) orderable_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + decimal_gens + string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + orderable_decimal_gens # TODO add in some array generators to this once that is supported for these operations # a selection of generators that can be compared for equality diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 722febeb85e..a22ea24ac2b 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -215,6 +215,21 @@ def test_to_unix_timestamp(data_gen, ansi_enabled): {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) +@pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +def test_from_utc_timestamp(data_gen, time_zone): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone))) + +@allow_non_gpu('ProjectExec, FromUTCTimestamp') +@pytest.mark.parametrize('time_zone', ["PST", "MST", "EST", "VST", "NST", "AST"], ids=idfn) +@pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +def test_from_utc_timestamp_fallback(data_gen, time_zone): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone)), + 'ProjectExec') + + @pytest.mark.parametrize('invalid,fmt', [ ('2021-01/01', 'yyyy-MM-dd'), ('2021/01-01', 'yyyy/MM/dd'), diff --git a/integration_tests/src/main/python/dpp_test.py b/integration_tests/src/main/python/dpp_test.py index 124b1cd3b7e..ad73358704c 100644 --- a/integration_tests/src/main/python/dpp_test.py +++ b/integration_tests/src/main/python/dpp_test.py @@ -287,7 +287,10 @@ def setup_tables(spark): " PARTITIONED BY (dt date, hr string, mins string) STORED AS PARQUET") spark.sql("INSERT INTO {}(id,dt,hr,mins)".format(fact_table) + " SELECT 'somevalue', to_date('2022-01-01'), '11', '59'") - with_cpu_session(setup_tables) + with_cpu_session(setup_tables, conf={ + "hive.exec.dynamic.partition" : "true", + "hive.exec.dynamic.partition.mode" : "nonstrict" + }) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.sql("SELECT COUNT(*) AS cnt FROM {} f".format(fact_table) + " LEFT JOIN (SELECT *, " + diff --git a/integration_tests/src/main/python/expand_exec_test.py b/integration_tests/src/main/python/expand_exec_test.py index 8974e313f0a..d60b7859095 100644 --- a/integration_tests/src/main/python/expand_exec_test.py +++ b/integration_tests/src/main/python/expand_exec_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,7 +19,9 @@ from marks import ignore_order @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) -@ignore_order +# Many Spark versions have issues sorting large decimals, +# see https://issues.apache.org/jira/browse/SPARK-40089. +@ignore_order(local=True) def test_expand_exec(data_gen): def op_df(spark, length=2048, seed=0): return gen_df(spark, StructGen([ diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index 53685b5e7c3..b84754a3d3f 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -20,6 +20,9 @@ from pyspark.sql.types import * from spark_session import with_cpu_session, with_gpu_session +# mark this test as ci_1 for mvn verify sanity check in pre-merge CI +pytestmark = pytest.mark.premerge_ci_1 + def create_df(spark, data_gen, left_length, right_length): left = binary_op_df(spark, data_gen, length=left_length) right = binary_op_df(spark, data_gen, length=right_length).withColumnRenamed("a", "r_a")\ diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 90a9884359a..752a461f58f 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -417,11 +417,12 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf): @approximate_float @ignore_order @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) +@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans + [_grpkey_short_mid_decimals, + _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_full_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_grpby_avg(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.avg('b')), + lambda spark: gen_df(spark, data_gen, length=200).groupby('a').agg(f.avg('b')), conf=conf ) @@ -460,10 +461,27 @@ def test_exceptAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) +# Spark fails to sort some decimal values due to overflow when calculating the sorting prefix. +# See https://issues.apache.org/jira/browse/SPARK-40129 +# Since pivot orders by value, avoid generating these extreme values for this test. 
+_pivot_gen_128bit = DecimalGen(precision=20, scale=2, special_cases=[]) +_pivot_big_decimals = [ + ('a', RepeatSeqGen(DecimalGen(precision=32, scale=10, nullable=(True, 10.0)), length=50)), + ('b', _pivot_gen_128bit), + ('c', DecimalGen(precision=36, scale=5))] +_pivot_short_big_decimals = [ + ('a', RepeatSeqGen(short_gen, length=50)), + ('b', _pivot_gen_128bit), + ('c', decimal_gen_128bit)] + +_pivot_gens_with_decimals = _init_list_with_nans_and_no_nans + [ + _grpkey_small_decimals, _pivot_big_decimals, _grpkey_short_mid_decimals, + _pivot_short_big_decimals, _grpkey_short_very_big_decimals, + _grpkey_short_very_big_neg_scale_decimals] @approximate_float @ignore_order(local=True) @incompat -@pytest.mark.parametrize('data_gen', _init_list_with_nans_and_no_nans_with_decimalbig, ids=idfn) +@pytest.mark.parametrize('data_gen', _pivot_gens_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs_with_nans, params_markers_for_confs_nans), ids=idfn) def test_hash_grpby_pivot(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( @@ -604,6 +622,8 @@ def test_hash_pivot_groupby_duplicates_fallback(data_gen): ('a', RepeatSeqGen(LongGen(), length=20)), ('b', value_gen)] for value_gen in _repeat_agg_column_for_collect_set_op_nested + _array_of_array_gen] +_all_basic_gens_with_all_nans_cases = all_basic_gens + [SetValuesGen(t, [math.nan, None]) for t in [FloatType(), DoubleType()]] + # very simple test for just a count on decimals 128 values until we can support more with them @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [decimal_gen_128bit], ids=idfn) @@ -637,18 +657,12 @@ def test_decimal128_min_max_group_by(data_gen): .agg(f.min('b'), f.max('b'))) @ignore_order(local=True) -@pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn) -def test_float_max_reduction_with_nan(data_gen): - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, data_gen).selectExpr('max(a)')) - -@ignore_order(local=True) -@pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn) -def test_float_max_group_by_with_nan(data_gen): +@pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) +def test_min_max_group_by(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) .groupby('a') - .agg(f.max('b'))) + .agg(f.min('b'), f.max('b'))) # to avoid ordering issues with collect_list we do it all in a single task @ignore_order(local=True) @@ -1105,7 +1119,7 @@ def test_first_last_reductions_nested_types(data_gen): lambda spark: unary_op_df(spark, data_gen).coalesce(1).selectExpr( 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) -@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn) +@pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) def test_generic_reductions(data_gen): local_conf = copy_and_update(_no_nans_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) assert_gpu_and_cpu_are_equal_collect( @@ -1133,7 +1147,7 @@ def test_count(data_gen): 'count(1)'), conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) -@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn) +@pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( @@ -1155,7 +1169,7 @@ def test_arithmetic_reductions(data_gen): conf = _no_nans_float_conf) 
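For context on the consolidated min/max tests above that now include the all-NaN cases: the sketch below (illustrative session and data, not taken from the suite) shows the Spark semantics they check; Spark orders NaN above every other floating point value, and aggregate functions skip nulls.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.appName("nan_min_max_demo").getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, float("nan")), (1, None)], ["a", "b"])
# max(b) is NaN because NaN sorts above every other double; min(b) is 1.0; the null is ignored.
df.groupby("a").agg(f.min("b"), f.max("b")).show()
spark.stop()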
@pytest.mark.parametrize('data_gen', - non_nan_all_basic_gens + decimal_gens + _nested_gens, + all_basic_gens + decimal_gens + _nested_gens, ids=idfn) def test_collect_list_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( @@ -1205,7 +1219,9 @@ def test_sorted_groupby_first_last(data_gen): lambda spark: agg_fn(gen_df(spark, gen_fn, num_slices=1)), conf = {'spark.sql.shuffle.partitions': '1'}) -@ignore_order +# Spark has a sorting bug with decimals, see https://issues.apache.org/jira/browse/SPARK-40129. +# Have pytest do the sorting rather than Spark as a workaround. +@ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) def test_agg_count(data_gen, count_func): @@ -1815,7 +1831,7 @@ def test_groupby_std_variance_partial_replace_fallback(data_gen, # test min max on single level structure # gens_for_max_min = [byte_gen, short_gen, int_gen, long_gen, - FloatGen(no_nans = True), DoubleGen(no_nans = True), + float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen, DecimalGen(precision=12, scale=2), @@ -1852,4 +1868,4 @@ def test_min_max_for_single_level_struct(data_gen): lambda spark : gen_df(spark, df_gen, length=1024), "hash_agg_table", 'select min(a) from hash_agg_table', - _no_nans_float_conf) \ No newline at end of file + _no_nans_float_conf) diff --git a/integration_tests/src/main/python/iceberg_test.py b/integration_tests/src/main/python/iceberg_test.py index 76ee30fcbe8..4b4ccfacc10 100644 --- a/integration_tests/src/main/python/iceberg_test.py +++ b/integration_tests/src/main/python/iceberg_test.py @@ -35,6 +35,9 @@ ArrayGen(StructGen([['child0', string_gen], ['child1', double_gen], ['child2', int_gen]])) ] + iceberg_map_gens + decimal_gens ] +rapids_reader_types = ['PERFILE', 'MULTITHREADED', 'COALESCING'] + + @allow_non_gpu("BatchScanExec") @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering @@ -53,7 +56,7 @@ def setup_iceberg_table(spark): @ignore_order(local=True) @pytest.mark.skipif(is_before_spark_320() or is_databricks_runtime(), reason="AQE+DPP not supported until Spark 3.2.0+ and AQE+DPP not supported on Databricks") -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_aqe_dpp(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -72,7 +75,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering @pytest.mark.parametrize("data_gens", iceberg_gens_list, ids=idfn) -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_parquet_read_round_trip(spark_tmp_table_factory, data_gens, reader_type): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(data_gens)] table = spark_tmp_table_factory.get() @@ -89,7 +92,7 @@ def setup_iceberg_table(spark): @iceberg @pytest.mark.parametrize("data_gens", [[long_gen]], ids=idfn) @pytest.mark.parametrize("iceberg_format", ["orc", "avro"], ids=idfn) -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_unsupported_formats(spark_tmp_table_factory, data_gens, iceberg_format, reader_type): gen_list = [('_c' + str(i), gen) for i, gen 
in enumerate(data_gens)] table = spark_tmp_table_factory.get() @@ -136,7 +139,7 @@ def setup_iceberg_table(spark): marks=pytest.mark.skipif(is_before_spark_320(), reason="Hadoop with Spark 3.1.x does not support lz4 by default")), ("zstd", None)], ids=idfn) -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_read_parquet_compression_codec(spark_tmp_table_factory, codec_info, reader_type): codec, error_msg = codec_info table = spark_tmp_table_factory.get() @@ -160,7 +163,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering @pytest.mark.parametrize("key_gen", [int_gen, long_gen, string_gen, boolean_gen, date_gen, timestamp_gen, decimal_gen_64bit], ids=idfn) -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_read_partition_key(spark_tmp_table_factory, key_gen, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -176,7 +179,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_input_meta(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -194,7 +197,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_disorder_read_schema(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -274,7 +277,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering @pytest.mark.skipif(is_before_spark_320(), reason="Spark 3.1.x has a catalog bug precluding scope prefix in table names") -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_read_timetravel(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -298,7 +301,7 @@ def setup_snapshots(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering @pytest.mark.skipif(is_before_spark_320(), reason="Spark 3.1.x has a catalog bug precluding scope prefix in table names") -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_incremental_read(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -328,7 +331,7 @@ def setup_snapshots(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_reorder_columns(spark_tmp_table_factory, reader_type): table = 
spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -349,7 +352,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_rename_column(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -370,7 +373,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_column_names_swapped(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -393,7 +396,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_alter_column_type(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -416,7 +419,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_add_column(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -437,7 +440,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_remove_column(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -458,7 +461,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_add_partition_field(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -479,7 +482,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_drop_partition_field(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -500,7 +503,7 @@ def setup_iceberg_table(spark): @iceberg @ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_v1_delete(spark_tmp_table_factory, 
reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -517,7 +520,7 @@ def setup_iceberg_table(spark): @iceberg @pytest.mark.skipif(is_before_spark_320(), reason="merge-on-read not supported on Spark 3.1.x") -@pytest.mark.parametrize('reader_type', ['PERFILE', 'MULTITHREADED']) +@pytest.mark.parametrize('reader_type', rapids_reader_types) def test_iceberg_v2_delete_unsupported(spark_tmp_table_factory, reader_type): table = spark_tmp_table_factory.get() tmpview = spark_tmp_table_factory.get() @@ -534,3 +537,19 @@ def setup_iceberg_table(spark): lambda spark : spark.sql("SELECT * FROM {}".format(table)).collect(), conf={'spark.rapids.sql.format.parquet.reader.type': reader_type}), "UnsupportedOperationException: Delete filter is not supported") + + +@iceberg +@ignore_order(local=True) # Iceberg plans with a thread pool and is not deterministic in file ordering +@pytest.mark.parametrize('reader_type', rapids_reader_types) +def test_iceberg_parquet_read_with_input_file(spark_tmp_table_factory, reader_type): + table = spark_tmp_table_factory.get() + tmpview = spark_tmp_table_factory.get() + def setup_iceberg_table(spark): + df = binary_op_df(spark, long_gen) + df.createOrReplaceTempView(tmpview) + spark.sql("CREATE TABLE {} USING ICEBERG AS SELECT * FROM {}".format(table, tmpview)) + with_cpu_session(setup_iceberg_table) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : spark.sql("SELECT *, input_file_name() FROM {}".format(table)), + conf={'spark.rapids.sql.format.parquet.reader.type': reader_type}) diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index 50c1b1ca70e..b64eab99d41 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -22,15 +22,14 @@ from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan from spark_session import with_cpu_session, with_spark_session -# Mark all tests in current file as premerge_ci_1 in order to be run in first k8s pod for parallel build premerge job -pytestmark = [pytest.mark.premerge_ci_1, pytest.mark.nightly_resource_consuming_test] +pytestmark = [pytest.mark.nightly_resource_consuming_test] all_join_types = ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross', 'FullOuter'] all_gen = [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(), BooleanGen(), DateGen(), TimestampGen(), null_gen, pytest.param(FloatGen(), marks=[incompat]), - pytest.param(DoubleGen(), marks=[incompat])] + decimal_gens + pytest.param(DoubleGen(), marks=[incompat])] + orderable_decimal_gens all_gen_no_nulls = [StringGen(nullable=False), ByteGen(nullable=False), ShortGen(nullable=False), IntegerGen(nullable=False), LongGen(nullable=False), @@ -73,7 +72,8 @@ # Types to use when running joins on small batches. Small batch joins can take a long time # to run and are mostly redundant with the normal batch size test, so we only run these on a # set of representative types rather than all types. 
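A rough sketch of the knob the parametrized Iceberg tests above now exercise, shown against a plain Parquet path for brevity rather than the Iceberg SQL the tests use; the path is a placeholder and the RAPIDS plugin plus catalog setup are assumed to be configured elsewhere. It combines the spark.rapids.sql.format.parquet.reader.type setting with input_file_name(), which the new test_iceberg_parquet_read_with_input_file now covers for the coalescing reader as well.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Plugin and catalog configuration are assumed to be in place already; only the
# reader-type setting exercised by the tests is shown, and the path is hypothetical.
spark = (SparkSession.builder
         .appName("reader_type_demo")
         .config("spark.rapids.sql.format.parquet.reader.type", "COALESCING")
         .getOrCreate())
spark.read.parquet("/tmp/example_parquet_data") \
    .select("*", f.input_file_name()) \
    .show(truncate=False)
spark.stop()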
-join_small_batch_gens = [ StringGen(), IntegerGen(), decimal_gen_128bit ] + +join_small_batch_gens = [ StringGen(), IntegerGen(), orderable_decimal_gen_128bit ] cartesian_join_small_batch_gens = join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)] _sortmerge_join_conf = {'spark.sql.autoBroadcastJoinThreshold': '-1', diff --git a/integration_tests/src/main/python/orc_cast_test.py b/integration_tests/src/main/python/orc_cast_test.py new file mode 100644 index 00000000000..6a84407a632 --- /dev/null +++ b/integration_tests/src/main/python/orc_cast_test.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error +from data_gen import * +from pyspark.sql.types import * +from spark_session import with_cpu_session +from orc_test import reader_opt_confs + + +def create_orc(data_gen_list, data_path): + # generate ORC dataframe, and dump it to local file 'data_path' + with_cpu_session( + lambda spark: gen_df(spark, data_gen_list).write.orc(data_path) + ) + + +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('to_type', ['boolean', 'tinyint', 'smallint', 'int', 'bigint']) +def test_casting_among_integer_types(spark_tmp_path, reader_confs, v1_enabled_list, to_type): + # cast integral types to another integral types + int_gens = [boolean_gen] + integral_gens + gen_list = [('c' + str(i), gen) for i, gen in enumerate(int_gens)] + data_path = spark_tmp_path + '/ORC_DATA' + create_orc(gen_list, data_path) + + # generate schema string like "c0 to_type, c1 to_type, ..., c4 to_type" + schema_str = " {}, ".join([x[0] for x in gen_list]) + " {}" + schema_str = schema_str.format(*([to_type] * len(gen_list))) + all_confs = copy_and_update(reader_confs, + {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.schema(schema_str).orc(data_path), + conf=all_confs) + + +@pytest.mark.parametrize('to_type', ['float', 'double', 'string', 'timestamp']) +def test_casting_from_integer(spark_tmp_path, to_type): + orc_path = spark_tmp_path + '/orc_cast_integer' + # The Python 'datetime' module only supports a max-year of 10000, so we set the Long type max + # to '1e11'. If the long-value is out of this range, pytest will throw an exception. 
+ data_gen = [('boolean_col', boolean_gen), ('tinyint_col', byte_gen), + ('smallint_col', ShortGen(min_val=BYTE_MAX + 1)), + ('int_col', IntegerGen(min_val=SHORT_MAX + 1)), + ('bigint_col', LongGen(min_val=INT_MAX + 1, max_val=int(1e11))), + ('negint_col', IntegerGen(max_val=-1))] + create_orc(data_gen, orc_path) + + schema_str = "boolean_col {}, tinyint_col {}, smallint_col {}, int_col {}, bigint_col {}, negint_col {}" + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.schema( + schema_str.format(*([to_type] * len(data_gen)))).orc(orc_path) + ) + +@pytest.mark.parametrize('overflow_long_gen', [LongGen(min_val=int(1e16)), + LongGen(max_val=int(-1e16))]) +@pytest.mark.parametrize('to_type', ['timestamp']) +def test_casting_from_overflow_long(spark_tmp_path, overflow_long_gen,to_type): + # Timestamp(micro-seconds) is actually type of int64, when casting long(int64) to timestamp, + # we need to multiply 1e6 (or 1e3), and it may cause overflow. This function aims to test + # whether if 'ArithmeticException' is caught. + orc_path = spark_tmp_path + '/orc_cast_overflow_long' + create_orc([('long_column', overflow_long_gen)], orc_path) + schema_str = "long_column {}".format(to_type) + assert_gpu_and_cpu_error( + df_fun=lambda spark: spark.read.schema(schema_str).orc(orc_path).collect(), + conf={}, + error_message="ArithmeticException" + ) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 204b4a127b9..7c0c775fc68 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -31,6 +31,14 @@ def read_orc_df(data_path): def read_orc_sql(data_path): return lambda spark : spark.sql('select * from orc.`{}`'.format(data_path)) +# ORC has issues reading timestamps where it is off by 1 second if the timestamp is before +# epoch in 1970 and the microsecond value is between 0 and 1000. +# See https://github.com/rapidsai/cudf/issues/11525. +def get_orc_timestamp_gen(nullable=True): + return TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc), nullable=nullable) + +orc_timestamp_gen = get_orc_timestamp_gen() + # test with original orc file reader, the multi-file parallel reader for cloud original_orc_file_reader_conf = {'spark.rapids.sql.format.orc.reader.type': 'PERFILE'} multithreaded_orc_file_reader_conf = {'spark.rapids.sql.format.orc.reader.type': 'MULTITHREADED'} @@ -51,7 +59,7 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, conf=all_confs) # ORC does not support negative scale for decimal. So here is "decimal_gens_no_neg". -# Otherwsie it will get the below exception. +# Otherwise it will get the below exception. # ... #E Caused by: java.lang.IllegalArgumentException: Missing integer at # 'struct<`_c0`:decimal(7,^-3),`_c1`:decimal(7,3),`_c2`:decimal(7,7),`_c3`:decimal(12,2)>' @@ -60,8 +68,7 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, # ... 
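A small standalone sketch (plain Python, with an illustrative helper name) of the arithmetic behind test_casting_from_overflow_long above: converting an ORC integer to a timestamp multiplies the value by 1e6 (or 1e3, depending on the Spark version's semantics) to reach microseconds, and longs around 1e16 push that product past the signed 64-bit range, which is why the test expects an ArithmeticException rather than a silently wrapped value.

INT64_MAX = 2 ** 63 - 1   # 9_223_372_036_854_775_807, roughly 9.2e18

def would_overflow_as_micros(value: int, factor: int = 1_000_000) -> bool:
    """True if value * factor cannot be represented as a signed 64-bit long."""
    product = value * factor           # exact: Python ints are arbitrary precision
    return not (-2 ** 63 <= product <= INT64_MAX)

# 1e16 * 1e6 = 1e22, far past INT64_MAX, so the ORC cast must raise instead of wrapping.
print(would_overflow_as_micros(int(1e16)))   # True
# 1e11 * 1e6 = 1e17 still fits, consistent with the 1e11 cap used above, which also keeps
# the values below Python datetime's year-10000 limit.
print(would_overflow_as_micros(int(1e11)))   # False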
orc_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), - TimestampGen(start=datetime(1590, 1, 1, tzinfo=timezone.utc)) - ] + decimal_gens + orc_timestamp_gen] + decimal_gens orc_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_basic_gens)]) @@ -85,7 +92,7 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, orc_basic_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [ BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, - lambda nullable=True: TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc), nullable=nullable), + lambda nullable=True: get_orc_timestamp_gen(nullable), lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable), lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable), lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]] @@ -152,7 +159,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e DateGen(start=date(1590, 1, 1)), # Once https://github.com/NVIDIA/spark-rapids/issues/140 is fixed replace this with # timestamp_gen - TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc))] + orc_timestamp_gen] @pytest.mark.order(2) @pytest.mark.parametrize('orc_gen', orc_pred_push_gens, ids=idfn) @@ -220,7 +227,7 @@ def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): # we should go with a more standard set of generators orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), - TimestampGen(start=datetime(1590, 1, 1, tzinfo=timezone.utc))] + orc_timestamp_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' with_cpu_session( @@ -287,7 +294,7 @@ def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): # we should go with a more standard set of generators orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), - TimestampGen(start=datetime(1590, 1, 1, tzinfo=timezone.utc))] + orc_timestamp_gen] first_gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0' with_cpu_session( @@ -664,26 +671,6 @@ def test_orc_scan_with_aggregate_no_pushdown_on_col_partition(spark_tmp_path, ag conf=_orc_aggregate_pushdown_enabled_conf) -@pytest.mark.parametrize('offset', [1,2,3,4], ids=idfn) -@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) -@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) -def test_read_type_casting_integral(spark_tmp_path, offset, reader_confs, v1_enabled_list): - int_gens = [boolean_gen] + integral_gens - gen_list = [('c' + str(i), gen) for i, gen in enumerate(int_gens)] - data_path = spark_tmp_path + '/ORC_DATA' - with_cpu_session( - lambda spark: gen_df(spark, gen_list).write.orc(data_path)) - - # build the read schema by a left shift of int_gens - shifted_int_gens = int_gens[offset:] + int_gens[:offset] - rs_gen_list = [('c' + str(i), gen) for i, gen in enumerate(shifted_int_gens)] - rs = StructGen(rs_gen_list, nullable=False).data_type - all_confs = copy_and_update(reader_confs, - {'spark.sql.sources.useV1SourceList': v1_enabled_list}) - assert_gpu_and_cpu_are_equal_collect( - lambda spark: 
spark.read.schema(rs).orc(data_path), - conf=all_confs) - def test_orc_read_count(spark_tmp_path): data_path = spark_tmp_path + '/ORC_DATA' orc_gens = [int_gen, string_gen, double_gen] diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py index 4b09db2740b..2f6de5b7f48 100644 --- a/integration_tests/src/main/python/sort_test.py +++ b/integration_tests/src/main/python/sort_test.py @@ -19,12 +19,19 @@ from marks import allow_non_gpu from pyspark.sql.types import * import pyspark.sql.functions as f +from spark_session import is_before_spark_340 + +# Many Spark versions have issues sorting decimals. +# https://issues.apache.org/jira/browse/SPARK-40089 +_orderable_not_null_big_decimal_gen = DecimalGen(precision=20, scale=2, nullable=False) +if is_before_spark_340(): + _orderable_not_null_big_decimal_gen = DecimalGen(precision=20, scale=2, nullable=False, special_cases=[]) orderable_not_null_gen = [ByteGen(nullable=False), ShortGen(nullable=False), IntegerGen(nullable=False), LongGen(nullable=False), FloatGen(nullable=False), DoubleGen(nullable=False), BooleanGen(nullable=False), TimestampGen(nullable=False), DateGen(nullable=False), StringGen(nullable=False), DecimalGen(precision=7, scale=3, nullable=False), DecimalGen(precision=12, scale=2, nullable=False), - DecimalGen(precision=20, scale=2, nullable=False)] + _orderable_not_null_big_decimal_gen] @allow_non_gpu('SortExec', 'ShuffleExchangeExec', 'RangePartitioning', 'SortOrder') @pytest.mark.parametrize('data_gen', [StringGen(nullable=False)], ids=idfn) @@ -164,7 +171,8 @@ def test_single_nested_sort_in_part(data_gen, order, stable_sort): conf=sort_conf) orderable_gens_sort = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)])] + decimal_gens + boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)]) + ] + orderable_decimal_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/spark_init_internal.py b/integration_tests/src/main/python/spark_init_internal.py index e36cc3d282b..3ba6c390c0d 100644 --- a/integration_tests/src/main/python/spark_init_internal.py +++ b/integration_tests/src/main/python/spark_init_internal.py @@ -12,34 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os +import re +import sys -try: - import pyspark -except ImportError as error: - import findspark - findspark.init() - import pyspark +logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) _CONF_ENV_PREFIX = 'PYSP_TEST_' _EXECUTOR_ENV_PREFIX = 'spark_executorEnv_' def env_for_conf(spark_conf_name): - return _CONF_ENV_PREFIX + spark_conf_name.replace('.', '_') + # escape underscores + escaped_conf = spark_conf_name.replace('_', r'__') + return _CONF_ENV_PREFIX + escaped_conf.replace('.', '_') def conf_for_env(env_name): conf_key = env_name[len(_CONF_ENV_PREFIX):] if conf_key.startswith(_EXECUTOR_ENV_PREFIX): res = _EXECUTOR_ENV_PREFIX.replace('_', '.') + conf_key[len(_EXECUTOR_ENV_PREFIX):] else: - res = conf_key.replace('_', '.') + # replace standalone underscores + res1 = re.sub(r'(?&1 | tee testout; " \ + "'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s bash %s %s 2>&1 | tee testout; " \ "if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \ (master_addr, params.private_key_file, params.jar_path, params.spark_conf, params.base_spark_pom_version, params.script_dest, ' '.join(params.script_args)) diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index 8cbbb4b526a..5cb660ad2a0 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -19,8 +19,11 @@ set -ex LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''} SPARK_CONF=${SPARK_CONF:-''} -BASE_SPARK_VER=${BASE_SPARK_VER:-'3.1.2'} -[[ -z $SPARK_SHIM_VER ]] && export SPARK_SHIM_VER=spark${BASE_SPARK_VER//.}db +BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-'3.1.2'} +[[ -z $SPARK_SHIM_VER ]] && export SPARK_SHIM_VER=spark${BASE_SPARK_VERSION//.}db + +# install required packages +sudo apt -y install zip unzip # Try to use "cudf-udf" conda environment for the python cudf-udf tests. if [ -d "/databricks/conda/envs/cudf-udf" ]; then @@ -34,7 +37,7 @@ export SPARK_HOME=/databricks/spark # change to not point at databricks confs so we don't conflict with their settings export SPARK_CONF_DIR=$PWD export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip -if [[ $BASE_SPARK_VER == "3.2.1" ]] +if [[ $BASE_SPARK_VERSION == "3.2.1" ]] then # Databricks Koalas can conflict with the actual Pandas version, so put site packages first export PYTHONPATH=/databricks/python3/lib/python3.8/site-packages:$PYTHONPATH @@ -65,7 +68,7 @@ if [ -n "$SPARK_CONF" ]; then fi IS_SPARK_311_OR_LATER=0 -[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VER" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1 +[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1 # TEST_MODE @@ -78,7 +81,7 @@ TEST_TYPE="nightly" PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer" ICEBERG_VERSION=${ICEBERG_VERSION:-0.13.2} -ICEBERG_SPARK_VER=$(echo $BASE_SPARK_VER | cut -d. -f1,2) +ICEBERG_SPARK_VER=$(echo $BASE_SPARK_VERSION | cut -d. 
-f1,2) # Classloader config is here to work around classloader issues with # --packages in distributed setups, should be fixed by # https://github.com/NVIDIA/spark-rapids/pull/5646 diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 256f5fdf7e4..228fb6f51fa 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -48,9 +48,7 @@ mvn_verify() { $MVN_INSTALL_CMD -DskipTests -Dbuildver=313 [[ $BUILD_MAINTENANCE_VERSION_SNAPSHOTS == "true" ]] && $MVN_INSTALL_CMD -Dbuildver=314 - # don't skip tests - env -u SPARK_HOME $MVN_CMD -U -B $MVN_URM_MIRROR -Dbuildver=320 clean install $MVN_BUILD_ARGS \ - -Dpytest.TEST_TAGS='' -pl '!tools' + $MVN_INSTALL_CMD -DskipTests -Dbuildver=320 # enable UTF-8 for regular expression tests env -u SPARK_HOME LC_ALL="en_US.UTF-8" $MVN_CMD $MVN_URM_MIRROR -Dbuildver=320 test $MVN_BUILD_ARGS \ -Dpytest.TEST_TAGS='' -pl '!tools' \ @@ -130,15 +128,17 @@ ci_2() { $MVN_CMD -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" - export TEST_PARALLEL=4 - # separate process to avoid OOM kill - TEST='conditionals_test or window_function_test' ./integration_tests/run_pyspark_from_build.sh - TEST_PARALLEL=5 TEST='struct_test or time_window_test' ./integration_tests/run_pyspark_from_build.sh - TEST='not conditionals_test and not window_function_test and not struct_test and not time_window_test' \ - ./integration_tests/run_pyspark_from_build.sh + export TEST_PARALLEL=5 + ./integration_tests/run_pyspark_from_build.sh + # enable avro test separately INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled LC_ALL="en_US.UTF-8" TEST="regexp_test.py" ./integration_tests/run_pyspark_from_build.sh + + # put some mvn tests here to balance durations of parallel stages + echo "Run mvn package..." 
+ env -u SPARK_HOME $MVN_CMD -U -B $MVN_URM_MIRROR -Dbuildver=320 clean package $MVN_BUILD_ARGS \ + -Dpytest.TEST_TAGS='' -pl '!tools' } @@ -183,7 +183,10 @@ export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2" export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH" tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \ rm -f $SPARK_HOME.tgz -export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip +# copy python path libs to container /tmp instead of workspace to avoid ephemeral PVC issue +TMP_PYTHON=/tmp/$(date +"%Y%m%d") +rm -rf $TMP_PYTHON && cp -r $SPARK_HOME/python $TMP_PYTHON +export PYTHONPATH=$TMP_PYTHON/python:$TMP_PYTHON/python/pyspark/:$TMP_PYTHON/python/lib/py4j-0.10.9-src.zip case $BUILD_TYPE in diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index 5ffeddc7619..ea480d93e47 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -110,7 +110,11 @@ export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2" export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH" tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \ rm -f $SPARK_HOME.tgz -export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip +# copy python path libs to container /tmp instead of workspace to avoid ephemeral PVC issue +TMP_PYTHON=/tmp/$(date +"%Y%m%d") +rm -rf $TMP_PYTHON && cp -r $SPARK_HOME/python $TMP_PYTHON +export PYTHONPATH=$TMP_PYTHON/python:$TMP_PYTHON/python/pyspark/:$TMP_PYTHON/python/lib/py4j-0.10.9-src.zip + # Extract 'value' from conda config string 'key: value' CONDA_ROOT=`conda config --show root_prefix | cut -d ' ' -f2` PYTHON_VER=`conda config --show default_python | cut -d ' ' -f2` @@ -195,9 +199,6 @@ run_iceberg_tests() { # Iceberg does not support Spark 3.3+ yet if [[ "$ICEBERG_SPARK_VER" < "3.3" ]]; then - # Classloader config is here to work around classloader issues with - # --packages in distributed setups, should be fixed by - # https://github.com/NVIDIA/spark-rapids/pull/5646 SPARK_SUBMIT_FLAGS="$BASE_SPARK_SUBMIT_ARGS $SEQ_CONF \ --packages org.apache.iceberg:iceberg-spark-runtime-${ICEBERG_SPARK_VER}_2.12:${ICEBERG_VERSION} \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ diff --git a/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala new file mode 100644 index 00000000000..fe914811df2 --- /dev/null +++ b/sql-plugin/src/main/311until320-all/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.shims + +import ai.rapids.cudf.{ColumnView, DType, Scalar} +import com.nvidia.spark.rapids.GpuOrcScan.{testLongMultiplicationOverflow, withResource} + +object OrcCastingShims { + /** + * Cast ColumnView of integer types to timestamp (in milliseconds). + * @param col The column view of integer types. + * @param fromType BOOL8, INT8/16/32/64 + * @return A new timestamp columnar vector. + */ + def castIntegerToTimestamp(col: ColumnView, fromType: DType): ColumnView = { + fromType match { + case DType.BOOL8 | DType.INT8 | DType.INT16 | DType.INT32 => + // From spark311 until spark314 (not include it), spark consider the integers as + // milli-seconds. + // cuDF requires casting to Long first, then we can cast Long to Timestamp(in microseconds) + // In CPU code of ORC casting, its conversion is 'integer -> milliseconds -> microseconds' + withResource(col.castTo(DType.INT64)) { longs => + withResource(Scalar.fromLong(1000L)) { thousand => + withResource(longs.mul(thousand)) { milliSeconds => + milliSeconds.castTo(DType.TIMESTAMP_MICROSECONDS) + } + } + } + case DType.INT64 => + // We need overflow checking here, since max value of INT64 is about 9 * 1e18, and convert + // INT64 to milliseconds(also a INT64 actually), we need multiply 1000, it may cause long + // integer-overflow. + // If these two 'testLongMultiplicationOverflow' throw no exception, it means no + // Long-overflow when casting 'col' to TIMESTAMP_MICROSECONDS. + if (col.max() != null) { + testLongMultiplicationOverflow(col.max().getLong, 1000L) + } + if (col.min() != null) { + testLongMultiplicationOverflow(col.min().getLong, 1000L) + } + withResource(Scalar.fromLong(1000L)) { thousand => + withResource(col.mul(thousand)) { milliSeconds => + milliSeconds.castTo(DType.TIMESTAMP_MICROSECONDS) + } + } + } + } +} diff --git a/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala b/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala index 4496ea9c93c..648a498893e 100644 --- a/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala +++ b/sql-plugin/src/main/311until320-nondb/scala/com/nvidia/spark/rapids/shims/Spark31XShims.scala @@ -262,26 +262,7 @@ abstract class Spark31XShims extends SparkShims with Spark31Xuntil33XShims with TypeSig.cpuNumeric))), (a, conf, p, r) => new AggExprMeta[Average](a, conf, p, r) { override def tagAggForGpu(): Unit = { - // For Decimal Average the SUM adds a precision of 10 to avoid overflowing - // then it divides by the count with an output scale that is 4 more than the input - // scale. With how our divide works to match Spark, this means that we will need a - // precision of 5 more. So 38 - 10 - 5 = 23 - val dataType = a.child.dataType - dataType match { - case dt: DecimalType => - if (dt.precision > 23) { - if (conf.needDecimalGuarantees) { - willNotWorkOnGpu("GpuAverage cannot guarantee proper overflow checks for " + - s"a precision large than 23. 
The current precision is ${dt.precision}") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"Average(${a.child.dataType}) produces ${dt} with an " + - s"intermediate precision of ${dt.precision + 15}") - } - } - case _ => // NOOP - } - GpuOverrides.checkAndTagFloatAgg(dataType, conf, this) + GpuOverrides.checkAndTagFloatAgg(a.child.dataType, conf, this) } override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = diff --git a/sql-plugin/src/main/31xdb/scala/com/nvidia/spark/rapids/shims/Spark31XdbShims.scala b/sql-plugin/src/main/31xdb/scala/com/nvidia/spark/rapids/shims/Spark31XdbShims.scala index f09873251ca..b1328b2502b 100644 --- a/sql-plugin/src/main/31xdb/scala/com/nvidia/spark/rapids/shims/Spark31XdbShims.scala +++ b/sql-plugin/src/main/31xdb/scala/com/nvidia/spark/rapids/shims/Spark31XdbShims.scala @@ -133,26 +133,7 @@ abstract class Spark31XdbShims extends Spark31XdbShimsBase with Logging { TypeSig.cpuNumeric))), (a, conf, p, r) => new AggExprMeta[Average](a, conf, p, r) { override def tagAggForGpu(): Unit = { - // For Decimal Average the SUM adds a precision of 10 to avoid overflowing - // then it divides by the count with an output scale that is 4 more than the input - // scale. With how our divide works to match Spark, this means that we will need a - // precision of 5 more. So 38 - 10 - 5 = 23 - val dataType = a.child.dataType - dataType match { - case dt: DecimalType => - if (dt.precision > 23) { - if (conf.needDecimalGuarantees) { - willNotWorkOnGpu("GpuAverage cannot guarantee proper overflow checks for " + - s"a precision large than 23. The current precision is ${dt.precision}") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"Average(${a.child.dataType}) produces ${dt} with an " + - s"intermediate precision of ${dt.precision + 15}") - } - } - case _ => // NOOP - } - GpuOverrides.checkAndTagFloatAgg(dataType, conf, this) + GpuOverrides.checkAndTagFloatAgg(a.child.dataType, conf, this) } override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala new file mode 100644 index 00000000000..b793a683e04 --- /dev/null +++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/OrcCastingShims.scala @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.shims + +import ai.rapids.cudf.{ColumnView, DType, Scalar} +import com.nvidia.spark.rapids.GpuOrcScan.{testLongMultiplicationOverflow, withResource} + +object OrcCastingShims { + /** + * Cast ColumnView of integer types to timestamp (in milliseconds). + * @param col The column view of integer types. + * @param fromType BOOL8, INT8/16/32/64 + * @return A new timestamp columnar vector. 
+ */ + def castIntegerToTimestamp(col: ColumnView, fromType: DType): ColumnView = { + fromType match { + case DType.BOOL8 | DType.INT8 | DType.INT16 | DType.INT32 => + // From spark320, spark consider the integers as seconds. + withResource(col.castTo(DType.INT64)) { longs => + // In CPU, ORC assumes the integer value is in seconds, and returns timestamp in + // micro seconds, so we need to multiply 1e6 here. + withResource(Scalar.fromLong(1000000L)) { value => + withResource(longs.mul(value)) { microSeconds => + microSeconds.castTo(DType.TIMESTAMP_MICROSECONDS) + } + } + } + + case DType.INT64 => + // In CPU code of ORC casting, its conversion is 'integer -> milliseconds -> microseconds' + withResource(Scalar.fromLong(1000L)) { thousand => + withResource(col.mul(thousand)) { milliSeconds => + // We need to check long-overflow here. If milliseconds can not convert to + // micorseconds, then testLongMultiplicationOverflow will throw exception. + if (milliSeconds.max() != null) { + testLongMultiplicationOverflow(milliSeconds.max().getLong, 1000L) + } + if (milliSeconds.min() != null) { + testLongMultiplicationOverflow(milliSeconds.min().getLong, 1000L) + } + withResource(milliSeconds.mul(thousand)) { microSeconds => + microSeconds.castTo(DType.TIMESTAMP_MICROSECONDS) + } + } + } + } + } +} diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/Spark320PlusShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/Spark320PlusShims.scala index ea175b2e6d8..90137bdacf6 100644 --- a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/Spark320PlusShims.scala +++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/Spark320PlusShims.scala @@ -141,26 +141,7 @@ trait Spark320PlusShims extends SparkShims with RebaseShims with Logging { TypeSig.numericAndInterval + TypeSig.NULL))), (a, conf, p, r) => new AggExprMeta[Average](a, conf, p, r) { override def tagAggForGpu(): Unit = { - // For Decimal Average the SUM adds a precision of 10 to avoid overflowing - // then it divides by the count with an output scale that is 4 more than the input - // scale. With how our divide works to match Spark, this means that we will need a - // precision of 5 more. So 38 - 10 - 5 = 23 - val dataType = a.child.dataType - dataType match { - case dt: DecimalType => - if (dt.precision > 23) { - if (conf.needDecimalGuarantees) { - willNotWorkOnGpu("GpuAverage cannot guarantee proper overflow checks for " + - s"a precision large than 23. 
The current precision is ${dt.precision}") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"Average(${a.child.dataType}) produces $dt with an " + - s"intermediate precision of ${dt.precision + 15}") - } - } - case _ => // NOOP - } - GpuOverrides.checkAndTagFloatAgg(dataType, conf, this) + GpuOverrides.checkAndTagFloatAgg(a.child.dataType, conf, this) } override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProvider.scala b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProvider.scala index cd9cc9666c0..b17a70d303f 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProvider.scala +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProvider.scala @@ -25,6 +25,8 @@ trait IcebergProvider { def isSupportedScan(scan: Scan): Boolean def getScans: Map[Class[_ <: Scan], ScanRule[_ <: Scan]] + + def copyScanWithInputFileTrue(scan: Scan): Scan } object IcebergProvider { diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProviderImpl.scala b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProviderImpl.scala index 9d440885e91..ae855214f2e 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProviderImpl.scala +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergProviderImpl.scala @@ -66,4 +66,11 @@ class IcebergProviderImpl extends IcebergProvider { ClassTag(cpuIcebergScanClass)) ).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap } + + override def copyScanWithInputFileTrue(scan: Scan): Scan = scan match { + case icebergBatchScan: GpuSparkBatchQueryScan => + icebergBatchScan.copyWithInputFileTrue(); + case _ => + throw new RuntimeException(s"Unsupported scan type: ${scan.getClass.getSimpleName}") + } } diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java index 9b958cbaa8b..8755fa27289 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java @@ -38,10 +38,12 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.datasources.PartitionedFile; import org.apache.spark.sql.rapids.InputFileUtils; +import org.apache.spark.sql.rapids.execution.TrampolineUtil; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import scala.collection.JavaConverters; import scala.collection.Seq; import scala.Tuple2; @@ -49,15 +51,12 @@ import java.io.UncheckedIOException; import java.net.URI; import java.net.URISyntaxException; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.stream.Collectors; -/** The wrapper of the GPU multi-threaded and coalescing(TBD) reader for Iceberg */ +/** The wrapper of the GPU multi-threaded and coalescing reader for Iceberg */ class GpuMultiFileBatchReader extends BaseDataReader { private static final Logger LOG = LoggerFactory.getLogger(GpuMultiFileBatchReader.class); - private final Map, Schema>> constsSchemaMap = - Maps.newConcurrentMap(); private final LinkedHashMap files; private final Schema 
expectedSchema; private final boolean caseSensitive; @@ -75,7 +74,7 @@ class GpuMultiFileBatchReader extends BaseDataReader { private boolean needNext = true; private boolean isBatchPending; // lazy variables - private FilePartitionReaderBase rapidsReader = null; + private CloseableIterator batchReader = null; GpuMultiFileBatchReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, Configuration conf, int maxBatchSizeRows, long maxBatchSizeBytes, @@ -104,38 +103,28 @@ class GpuMultiFileBatchReader extends BaseDataReader { @Override public ColumnarBatch get() { - if (rapidsReader == null) { + if (batchReader == null) { // Not initialized, return null to align with PerFile reader. return null; } needNext = true; isBatchPending = false; - // The same post-process with PerFile reader. - try (ColumnarBatch batch = rapidsReader.get()) { - // The Rapids reader should already set the current file. - String curFile = InputFileUtils.getCurInputFilePath(); - Tuple2, Schema> constsSchema = constsSchemaMap.get(curFile); - Map idToConsts = constsSchema._1(); - Schema updatedReadSchema = constsSchema._2(); - return GpuIcebergReader.addUpcastsIfNeeded( - GpuIcebergReader.addConstantColumns(batch, updatedReadSchema, idToConsts), - updatedReadSchema); - } + return batchReader.next(); } @Override public boolean next() throws IOException { - ensureRapidsReader(); + ensureBatchReader(); if (needNext) { needNext = false; - isBatchPending = rapidsReader.next(); + isBatchPending = batchReader.hasNext(); } return isBatchPending; } @Override public void close() throws IOException { - if (rapidsReader != null) rapidsReader.close(); + if (batchReader != null) batchReader.close(); super.close(); } @@ -146,89 +135,23 @@ CloseableIterator open(FileScanTask task) { throw new IllegalStateException(); } - private void ensureRapidsReader() { - if (rapidsReader == null) { - if (FileFormat.PARQUET.equals(fileFormat)) { - if (useMultiThread) { - rapidsReader = createParquetMultiThreadReader(); - } else { - // TODO Support coalescing reading, tracked by - // https://github.com/NVIDIA/spark-rapids/issues/5942 - throw new UnsupportedOperationException( - "Coalescing reading is not supported for Parquet reads yet"); - } + private void ensureBatchReader() { + if (batchReader != null) { + return; + } + if (FileFormat.PARQUET.equals(fileFormat)) { + if (useMultiThread) { + LOG.debug("Using Iceberg Parquet multi-threaded reader, task attempt ID: " + + TaskContext.get().taskAttemptId()); + batchReader = new ParquetMultiThreadBatchReader(); } else { - throw new UnsupportedOperationException( - "Format: " + fileFormat + " is not supported for batched reads"); + LOG.debug("Using Iceberg Parquet coalescing reader, task attempt ID: " + + TaskContext.get().taskAttemptId()); + batchReader = new ParquetCoalescingBatchReader(); } - } - } - - private FilePartitionReaderBase createParquetMultiThreadReader() { - LOG.debug("Using multi-threaded Iceberg Parquet reader, task attempt ID: " + - TaskContext.get().taskAttemptId()); - // Iceberg will handle partition values itself. 
- StructType emptyPartSchema = new StructType(); - InternalRow emptyPartValue = InternalRow.empty(); - - PartitionedFile[] files = this.files.values().stream() - .map(fst -> PartitionedFileUtils.newPartitionedFile(emptyPartValue, - fst.file().path().toString(), fst.start(), fst.length())) - .toArray(PartitionedFile[]::new); - - return new MultiFileCloudParquetPartitionReader(conf, files, this::filterParquetBlocks, - caseSensitive, parquetDebugDumpPrefix, maxBatchSizeRows, maxBatchSizeBytes, - metrics, emptyPartSchema, numThreads, maxNumFileProcessed, - false, // ignoreMissingFiles - false, // ignoreCorruptFiles - false // useFieldId - ); - } - - /** The filter function for the Parquet multi-file reader */ - private ParquetFileInfoWithBlockMeta filterParquetBlocks(PartitionedFile file) { - FileScanTask fst = this.files.get(file.filePath()); - GpuDeleteFilter deleteFilter = deleteFilter(fst); - if (deleteFilter != null) { - throw new UnsupportedOperationException("Delete filter is not supported"); - } - Schema updatedSchema = requiredSchema(deleteFilter); - Map idToConstant = constantsMap(fst, updatedSchema); - InputFile inFile = getInputFile(fst); - ParquetReadOptions readOptions = - GpuParquet.buildReaderOptions(inFile, fst.start(), fst.length()); - try (ParquetFileReader reader = GpuParquetReader.newReader(inFile, readOptions)) { - MessageType fileSchema = reader.getFileMetaData().getSchema(); - - List filteredRowGroups = GpuParquetReader.filterRowGroups(reader, - nameMapping, updatedSchema, fst.residual(), caseSensitive); - - GpuParquetReader.ReorderColumns reorder = ParquetSchemaUtil.hasIds(fileSchema) ? - new GpuParquetReader.ReorderColumns(idToConstant) : - new GpuParquetReader.ReorderColumnsFallback(idToConstant); - - MessageType fileReadSchema = (MessageType) TypeWithSchemaVisitor.visit( - updatedSchema.asStruct(), fileSchema, reorder); - Seq clippedBlocks = GpuParquetUtils.clipBlocksToSchema( - fileReadSchema, filteredRowGroups, caseSensitive); - StructType partReaderSparkSchema = (StructType) TypeWithSchemaVisitor.visit( - updatedSchema.asStruct(), fileReadSchema, new GpuParquetReader.SparkSchemaConverter()); - - // cache the updated constants - Map updatedConstants = - GpuParquetReader.addNullsForMissingFields(idToConstant, reorder.getMissingFields()); - constsSchemaMap.put(file.filePath(), Tuple2.apply(updatedConstants, updatedSchema)); - - return ParquetFileInfoWithBlockMeta.apply(new Path(new URI(file.filePath())), - clippedBlocks, InternalRow.empty(), fileReadSchema, partReaderSparkSchema, - true, // isCorrectedInt96RebaseMode - true, // isCorrectedRebaseMode - true // hasInt96Timestamps - ); - } catch (IOException e) { - throw new UncheckedIOException("Failed to open file: " + inFile, e); - } catch (URISyntaxException ue) { - throw new IllegalArgumentException("Invalid file path: " + inFile, ue); + } else { + throw new UnsupportedOperationException( + "Format: " + fileFormat + " is not supported for multi-file batched reads"); } } @@ -250,4 +173,276 @@ private Schema requiredSchema(GpuDeleteFilter deleteFilter) { return expectedSchema; } } + + static class FilteredParquetFileInfo { + private final ParquetFileInfoWithBlockMeta parquetBlockMeta; + private final Map idToConstant; + private final Schema expectedSchema; + + FilteredParquetFileInfo(ParquetFileInfoWithBlockMeta parquetBlockMeta, + Map idToConstant, Schema expectedSchema) { + this.parquetBlockMeta = parquetBlockMeta; + this.idToConstant = idToConstant; + this.expectedSchema = expectedSchema; + } + + 
ParquetFileInfoWithBlockMeta parquetBlockMeta() { + return parquetBlockMeta; + } + + Map idToConstant() { + return idToConstant; + } + + Schema expectedSchema() { + return expectedSchema; + } + } + + static class IcebergParquetExtraInfo extends ParquetExtraInfo { + private final Map idToConstant; + private final Schema expectedSchema; + private final PartitionSpec partitionSpec; + + IcebergParquetExtraInfo(boolean isCorrectedRebaseMode, + boolean isCorrectedInt96RebaseMode, boolean hasInt96Timestamps, + Map idToConstant, Schema expectedSchema, PartitionSpec partitionSpec) { + super(isCorrectedRebaseMode, isCorrectedInt96RebaseMode, hasInt96Timestamps); + this.idToConstant = idToConstant; + this.expectedSchema = expectedSchema; + this.partitionSpec = partitionSpec; + } + + Map idToConstant() { + return idToConstant; + } + + Schema expectedSchema() { + return expectedSchema; + } + + PartitionSpec partitionSpec() { + return partitionSpec; + } + } + + abstract class MultiFileBatchReaderBase implements CloseableIterator { + protected final FilePartitionReaderBase rapidsReader; + + protected MultiFileBatchReaderBase() { + // Iceberg will handle partition values itself. So both + // the partitioned schema and values are empty for the Rapids reader. + final StructType emptyPartSchema = new StructType(); + final InternalRow emptyPartValue = InternalRow.empty(); + PartitionedFile[] pFiles = files.values().stream() + .map(fst -> PartitionedFileUtils.newPartitionedFile(emptyPartValue, + fst.file().path().toString(), fst.start(), fst.length())) + .toArray(PartitionedFile[]::new); + rapidsReader = createRapidsReader(pFiles, emptyPartSchema); + } + + @Override + public void close() throws IOException { + rapidsReader.close(); + } + + @Override + public boolean hasNext() { + try { + return rapidsReader.next(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + protected abstract FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, + StructType partitionSchema); + + /** The filter function for the Parquet multi-file reader */ + protected FilteredParquetFileInfo filterParquetBlocks(FileScanTask fst) { + GpuDeleteFilter deleteFilter = deleteFilter(fst); + if (deleteFilter != null) { + throw new UnsupportedOperationException("Delete filter is not supported"); + } + Schema updatedSchema = requiredSchema(deleteFilter); + Map idToConstant = constantsMap(fst, updatedSchema); + InputFile inFile = getInputFile(fst); + ParquetReadOptions readOptions = + GpuParquet.buildReaderOptions(inFile, fst.start(), fst.length()); + try (ParquetFileReader reader = GpuParquetReader.newReader(inFile, readOptions)) { + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + List filteredRowGroups = GpuParquetReader.filterRowGroups(reader, + nameMapping, updatedSchema, fst.residual(), caseSensitive); + + GpuParquetReader.ReorderColumns reorder = ParquetSchemaUtil.hasIds(fileSchema) ? 
+ new GpuParquetReader.ReorderColumns(idToConstant) : + new GpuParquetReader.ReorderColumnsFallback(idToConstant); + + MessageType fileReadSchema = (MessageType) TypeWithSchemaVisitor.visit( + updatedSchema.asStruct(), fileSchema, reorder); + Seq clippedBlocks = GpuParquetUtils.clipBlocksToSchema( + fileReadSchema, filteredRowGroups, caseSensitive); + StructType partReaderSparkSchema = (StructType) TypeWithSchemaVisitor.visit( + updatedSchema.asStruct(), fileReadSchema, new GpuParquetReader.SparkSchemaConverter()); + + // cache the updated constants + Map updatedConstants = + GpuParquetReader.addNullsForMissingFields(idToConstant, reorder.getMissingFields()); + + ParquetFileInfoWithBlockMeta parquetBlockMeta = ParquetFileInfoWithBlockMeta.apply( + new Path(new URI(fst.file().path().toString())), clippedBlocks, + InternalRow.empty(), fileReadSchema, partReaderSparkSchema, + true, // isCorrectedInt96RebaseMode + true, // isCorrectedRebaseMode + true // hasInt96Timestamps + ); + return new FilteredParquetFileInfo(parquetBlockMeta, updatedConstants, updatedSchema); + } catch (IOException e) { + throw new UncheckedIOException("Failed to open file: " + inFile, e); + } catch (URISyntaxException ue) { + throw new IllegalArgumentException("Invalid file path: " + inFile, ue); + } + } // end of filterParquetBlocks + } + + class ParquetMultiThreadBatchReader extends MultiFileBatchReaderBase { + private final Map, Schema>> constsSchemaMap = + Maps.newConcurrentMap(); + + ParquetMultiThreadBatchReader() { + super(); + } + + @Override + protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, + StructType partitionSchema) { + return new MultiFileCloudParquetPartitionReader(conf, pFiles, + this::filterParquetBlocks, caseSensitive, parquetDebugDumpPrefix, + maxBatchSizeRows, maxBatchSizeBytes, metrics, partitionSchema, + numThreads, maxNumFileProcessed, + false, // ignoreMissingFiles + false, // ignoreCorruptFiles + false // useFieldId + ); + } + + private ParquetFileInfoWithBlockMeta filterParquetBlocks(PartitionedFile file) { + FileScanTask fst = files.get(file.filePath()); + FilteredParquetFileInfo filteredInfo = filterParquetBlocks(fst); + constsSchemaMap.put(file.filePath(), + Tuple2.apply(filteredInfo.idToConstant(), filteredInfo.expectedSchema())); + return filteredInfo.parquetBlockMeta(); + } + + @Override + public ColumnarBatch next() { + // The same post-process with PerFile reader. + try (ColumnarBatch batch = rapidsReader.get()) { + // The Rapids reader should already set the current file. 
+ String curFile = InputFileUtils.getCurInputFilePath(); + Tuple2, Schema> constsSchema = constsSchemaMap.get(curFile); + Map idToConsts = constsSchema._1(); + Schema updatedReadSchema = constsSchema._2(); + return GpuIcebergReader.addUpcastsIfNeeded( + GpuIcebergReader.addConstantColumns(batch, updatedReadSchema, idToConsts), + updatedReadSchema); + } + } + } + + class ParquetCoalescingBatchReader extends MultiFileBatchReaderBase { + + ParquetCoalescingBatchReader() { + super(); + } + + @Override + protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, + StructType partitionSchema) { + ArrayList clippedBlocks = new ArrayList(); + files.values().forEach(fst -> { + FilteredParquetFileInfo filteredInfo = filterParquetBlocks(fst); + List fileSingleMetas = + JavaConverters.asJavaCollection(filteredInfo.parquetBlockMeta.blocks()).stream() + .map(b -> ParquetSingleDataBlockMeta.apply( + filteredInfo.parquetBlockMeta.filePath(), + ParquetDataBlock.apply(b), + InternalRow.empty(), + ParquetSchemaWrapper.apply(filteredInfo.parquetBlockMeta.schema()), + filteredInfo.parquetBlockMeta.readSchema(), + new IcebergParquetExtraInfo( + filteredInfo.parquetBlockMeta.isCorrectedRebaseMode(), + filteredInfo.parquetBlockMeta.isCorrectedInt96RebaseMode(), + filteredInfo.parquetBlockMeta.hasInt96Timestamps(), + filteredInfo.idToConstant(), + filteredInfo.expectedSchema(), + fst.spec()))) + .collect(Collectors.toList()); + clippedBlocks.addAll(fileSingleMetas); + }); + + return new MultiFileParquetPartitionReader(conf, pFiles, + JavaConverters.asScalaBuffer(clippedBlocks).toSeq(), + caseSensitive, parquetDebugDumpPrefix, maxBatchSizeRows, maxBatchSizeBytes, + metrics, partitionSchema, numThreads, + false, // ignoreMissingFiles + false, // ignoreCorruptFiles + false // useFieldId + ) { + @Override + public boolean checkIfNeedToSplitDataBlock(SingleDataBlockInfo currentBlockInfo, + SingleDataBlockInfo nextBlockInfo) { + // Check the read schema, because it may differ among files in Iceberg. + if (!TrampolineUtil.sameType(currentBlockInfo.readSchema(), + nextBlockInfo.readSchema())) { + return true; + } + // For now in Iceberg, blocks with different partition schemas or partition values + // do not coalesce. + // We will try to figure out whether it is possible to merge and add different + // partition values correctly in the future, to allow coalescing even when + // partition values differ but the partition schema is the same, + // tracked by https://github.com/NVIDIA/spark-rapids/issues/6423. + IcebergParquetExtraInfo curEInfo = + (IcebergParquetExtraInfo)currentBlockInfo.extraInfo(); + IcebergParquetExtraInfo nextEInfo = + (IcebergParquetExtraInfo)nextBlockInfo.extraInfo(); + if (!samePartitionSpec(curEInfo, nextEInfo)) { + return true; + } + + return super.checkIfNeedToSplitDataBlock(currentBlockInfo, nextBlockInfo); + } + + @Override + public ColumnarBatch finalizeOutputBatch(ColumnarBatch batch, ExtraInfo extraInfo) { + Map idToConsts = ((IcebergParquetExtraInfo)extraInfo).idToConstant(); + Schema expectedSchema = ((IcebergParquetExtraInfo)extraInfo).expectedSchema(); + return GpuIcebergReader.addUpcastsIfNeeded( + GpuIcebergReader.addConstantColumns(batch, expectedSchema, idToConsts), + expectedSchema); + } + + private boolean samePartitionSpec(IcebergParquetExtraInfo curEInfo, + IcebergParquetExtraInfo nextEInfo) { + if (curEInfo.partitionSpec().partitionType() + .equals(nextEInfo.partitionSpec().partitionType())) { + // The partition schema is equivalent, so check the partition values next.
+ // Only identity fields were added into constants map. + return curEInfo.partitionSpec().identitySourceIds().stream().allMatch(id -> + Objects.deepEquals( + curEInfo.idToConstant().get(id), + nextEInfo.idToConstant().get(id))); + } + return false; + } + }; // end of "return new MultiFileParquetPartitionReader" + } + + @Override + public ColumnarBatch next() { + return rapidsReader.get(); + } + } } diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkBatchQueryScan.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkBatchQueryScan.java index d1fa44c9469..b89ad62163e 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkBatchQueryScan.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkBatchQueryScan.java @@ -106,7 +106,9 @@ public static GpuSparkBatchQueryScan fromCpu(Scan cpuInstance, RapidsConf rapids // No TableScan instance, so try to build one now scan = buildScan(cpuInstance, table, readConf, expectedSchema, filters); } - return new GpuSparkBatchQueryScan(SparkSession.active(), table, scan, readConf, expectedSchema, filters, rapidsConf); + return new GpuSparkBatchQueryScan(SparkSession.active(), table, scan, readConf, expectedSchema, filters, rapidsConf, + false // queryUsesInputFile + ); } // Try to build an Iceberg TableScan when one was not found in the CPU instance. @@ -165,9 +167,10 @@ private static TableScan buildScan(Scan cpuInstance, } GpuSparkBatchQueryScan(SparkSession spark, Table table, TableScan scan, SparkReadConf readConf, - Schema expectedSchema, List filters, RapidsConf rapidsConf) { + Schema expectedSchema, List filters, RapidsConf rapidsConf, + boolean queryUsesInputFile) { - super(spark, table, readConf, expectedSchema, filters, rapidsConf); + super(spark, table, readConf, expectedSchema, filters, rapidsConf, queryUsesInputFile); this.scan = scan; this.snapshotId = readConf.snapshotId(); @@ -346,14 +349,15 @@ public boolean equals(Object o) { Objects.equals(snapshotId, that.snapshotId) && Objects.equals(startSnapshotId, that.startSnapshotId) && Objects.equals(endSnapshotId, that.endSnapshotId) && - Objects.equals(asOfTimestamp, that.asOfTimestamp); + Objects.equals(asOfTimestamp, that.asOfTimestamp) && + queryUsesInputFile() == that.queryUsesInputFile(); } @Override public int hashCode() { return Objects.hash( table().name(), readSchema(), filterExpressions().toString(), runtimeFilterExpressions.toString(), - snapshotId, startSnapshotId, endSnapshotId, asOfTimestamp); + snapshotId, startSnapshotId, endSnapshotId, asOfTimestamp, queryUsesInputFile()); } @Override @@ -362,4 +366,12 @@ public String toString() { "IcebergScan(table=%s, type=%s, filters=%s, runtimeFilters=%s, caseSensitive=%s)", table(), expectedSchema().asStruct(), filterExpressions(), runtimeFilterExpressions, caseSensitive()); } + + /** Return a copy of "this" but with "queryUsesInputFile = true" */ + public GpuSparkBatchQueryScan copyWithInputFileTrue() { + return new GpuSparkBatchQueryScan(SparkSession.active(), table(), this.scan, readConf(), + expectedSchema(), filterExpressions(), rapidsConf(), + true // queryUsesInputFile + ); + } } diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkScan.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkScan.java index 2b956d33a7e..2c5b670fc3a 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkScan.java 
+++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuSparkScan.java @@ -79,13 +79,14 @@ abstract class GpuSparkScan extends ScanWithMetricsWrapper private final List filterExpressions; private final boolean readTimestampWithoutZone; private final RapidsConf rapidsConf; + private final boolean queryUsesInputFile; // lazy variables private StructType readSchema = null; GpuSparkScan(SparkSession spark, Table table, SparkReadConf readConf, Schema expectedSchema, List filters, - RapidsConf rapidsConf) { + RapidsConf rapidsConf, boolean queryUsesInputFile) { SparkSchemaUtil.validateMetadataColumnReferences(table.schema(), expectedSchema); @@ -97,12 +98,21 @@ abstract class GpuSparkScan extends ScanWithMetricsWrapper this.filterExpressions = filters != null ? filters : Collections.emptyList(); this.readTimestampWithoutZone = readConf.handleTimestampWithoutZone(); this.rapidsConf = rapidsConf; + this.queryUsesInputFile = queryUsesInputFile; } protected Table table() { return table; } + protected SparkReadConf readConf() { + return readConf; + } + + protected RapidsConf rapidsConf() { + return rapidsConf; + } + protected boolean caseSensitive() { return caseSensitive; } @@ -117,6 +127,10 @@ protected List filterExpressions() { protected abstract List tasks(); + boolean queryUsesInputFile() { + return queryUsesInputFile; + } + @Override public Batch toBatch() { return new SparkBatch(sparkContext, table, readConf, tasks(), expectedSchema, @@ -184,14 +198,19 @@ static class ReaderFactory implements PartitionReaderFactory { private final scala.collection.immutable.Set allCloudSchemes; private final boolean canUseParquetMultiThread; private final boolean canUseParquetCoalescing; + private final boolean isParquetPerFileReadEnabled; public ReaderFactory(scala.collection.immutable.Map metrics, - RapidsConf rapidsConf) { + RapidsConf rapidsConf, boolean queryUsesInputFile) { this.metrics = metrics; this.allCloudSchemes = rapidsConf.getCloudSchemes().toSet(); - // Only multi-threaded Parquet is supported. + this.isParquetPerFileReadEnabled = rapidsConf.isParquetPerFileReadEnabled(); this.canUseParquetMultiThread = rapidsConf.isParquetMultiThreadReadEnabled(); - this.canUseParquetCoalescing = false; + // Unlike the code in "GpuParquetMultiFilePartitionReaderFactory", this does not + // check "ignoreCorruptFiles", since "ignoreCorruptFiles" is + // not honored by Iceberg. + this.canUseParquetCoalescing = rapidsConf.isParquetCoalesceFileReadEnabled() && + !queryUsesInputFile; } @Override @@ -203,7 +222,6 @@ public PartitionReader createReader(InputPartition partition) { public PartitionReader createColumnarReader(InputPartition partition) { if (partition instanceof ReadTask) { ReadTask rTask = (ReadTask) partition; - // ret = (canAccelerateRead, isMultiThread, fileFormat) = (_1(), _2(), _3()) scala.Tuple3 ret = multiFileReadCheck(rTask); boolean canAccelerateRead = ret._1(); if (canAccelerateRead) { @@ -233,6 +251,7 @@ public boolean supportColumnarReads(InputPartition partition) { */ private scala.Tuple3 multiFileReadCheck(ReadTask readTask) { Collection scans = readTask.files(); + boolean isSingleFormat = false, isPerFileReadEnabled = false; boolean canUseMultiThread = false, canUseCoalescing = false; FileFormat ff = null; // Require all the files in a partition have the same file format. @@ -240,9 +259,11 @@ private scala.Tuple3 multiFileReadCheck(ReadTask r // Now only Parquet is supported.
canUseMultiThread = canUseParquetMultiThread; canUseCoalescing = canUseParquetCoalescing; + isPerFileReadEnabled = isParquetPerFileReadEnabled; + isSingleFormat = true; ff = FileFormat.PARQUET; } - boolean canAccelerateRead = canUseMultiThread || canUseCoalescing; + boolean canAccelerateRead = !isPerFileReadEnabled && isSingleFormat; String[] files = scans.stream().map(f -> f.file().path().toString()) .toArray(String[]::new); // Get the final decision for the subtype of the Rapids reader. diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/SparkBatch.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/SparkBatch.java index 413d07694b0..e10b0883586 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/SparkBatch.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/SparkBatch.java @@ -84,7 +84,8 @@ public InputPartition[] planInputPartitions() { @Override public PartitionReaderFactory createReaderFactory() { - return new GpuSparkScan.ReaderFactory(parentScan.metrics(), rapidsConf); + return new GpuSparkScan.ReaderFactory(parentScan.metrics(), rapidsConf, + parentScan.queryUsesInputFile()); } @Override diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AlluxioUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AlluxioUtils.scala index ceef7b400dc..33851dffae8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AlluxioUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AlluxioUtils.scala @@ -42,7 +42,7 @@ object AlluxioUtils extends Logging { throw new FileNotFoundException( s"Alluxio path $alluxio_path does not exist, maybe forgot to mount it") } - logInfo(s"Alluxio path $alluxio_path is mounted") + logDebug(s"Alluxio path $alluxio_path is mounted") checkedAlluxioPath.add(alluxio_path) } else { logDebug(s"Alluxio path $alluxio_path already mounted") @@ -106,7 +106,7 @@ object AlluxioUtils extends Logging { // record it as a mounted point if (items(0).contains("://")) { mountedBuckets(items(2)) = items(0) - logInfo(s"Found mounted bucket ${items(0)} to ${items(2)}") + logDebug(s"Found mounted bucket ${items(0)} to ${items(2)}") } } } @@ -176,7 +176,7 @@ object AlluxioUtils extends Logging { logInfo(s"Mounted bucket $remote_path to $local_bucket in Alluxio $output") mountedBuckets(local_bucket) = remote_path } else if (mountedBuckets(local_bucket).equals(remote_path)) { - logInfo(s"Already mounted bucket $remote_path to $local_bucket in Alluxio") + logDebug(s"Already mounted bucket $remote_path to $local_bucket in Alluxio") } else { throw new RuntimeException(s"Found a same bucket name in $remote_path " + s"and ${mountedBuckets(local_bucket)}") @@ -242,7 +242,7 @@ object AlluxioUtils extends Logging { // replace s3://foo/.. to alluxio://alluxioMasterHost/foo/... 
val newPath = new Path(pathStr.replaceFirst( scheme + ":/", "alluxio://" + alluxioMasterHost.get)) - logInfo(s"Replace $pathStr to ${newPath.toString}") + logDebug(s"Replace $pathStr to ${newPath.toString}") newPath } else { f diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 1e52ff60de2..600669ccf55 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -169,8 +169,9 @@ object GpuCast extends Arm { private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{1,2}[ ]?\\Z" private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}[ ]?\\Z" private val TIMESTAMP_REGEX_FULL = - "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ T]?(\\d{1,2}:\\d{1,2}:\\d{1,2}\\.\\d{6}Z)\\Z" - private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{1,2}:\\d{1,2}:\\d{1,2}\\.\\d{6}Z)\\Z" + "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ T]?(\\d{1,2}:\\d{1,2}:([0-5]\\d|\\d)(\\.\\d{0,6})?Z?)\\Z" + private val TIMESTAMP_REGEX_NO_DATE = + "\\A[T]?(\\d{1,2}:\\d{1,2}:([0-5]\\d|\\d)(\\.\\d{0,6})?Z?)\\Z" private val BIG_DECIMAL_LONG_MIN = BigDecimal(Long.MinValue) private val BIG_DECIMAL_LONG_MAX = BigDecimal(Long.MaxValue) @@ -1314,13 +1315,23 @@ object GpuCast extends Arm { val cudfFormat1 = "%Y-%m-%d %H:%M:%S.%f" val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" + val cudfFormat3 = "%Y-%m-%d %H:%M:%S" + val cudfFormat4 = "%Y-%m-%dT%H:%M:%S" withResource(orElse) { orElse => // valid dates must match the regex and either of the cuDF formats val isCudfMatch = withResource(input.isTimestamp(cudfFormat1)) { isTimestamp1 => withResource(input.isTimestamp(cudfFormat2)) { isTimestamp2 => - isTimestamp1.or(isTimestamp2) + withResource(input.isTimestamp(cudfFormat3)) { isTimestamp3 => + withResource(input.isTimestamp(cudfFormat4)) { isTimestamp4 => + withResource(isTimestamp1.or(isTimestamp2)) { isTimestamp12 => + withResource(isTimestamp12.or(isTimestamp3)) { isTimestamp123 => + isTimestamp123.or(isTimestamp4) + } + } + } + } } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuInSet.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuInSet.scala index 2835b3ef158..d90b8b87c70 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuInSet.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuInSet.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
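The GpuInSet hunk that follows replaces a ThreadLocal cache of the needles column (cleaned up by a task-completion listener) with building and closing the needles vector on every doColumnar call through withResource. As a rough sketch of that resource-scoping pattern, assuming a simplified stand-in for the plugin's Arm helper and a plain java.io.StringReader instead of a cuDF ColumnVector:

object WithResourceSketch {
  // Simplified stand-in for the Arm-style helper: run the body, then always close.
  def withResource[R <: AutoCloseable, T](resource: R)(body: R => T): T =
    try body(resource) finally resource.close()

  def main(args: Array[String]): Unit = {
    // Any AutoCloseable works for the illustration; in GpuInSet the resource would
    // be the needles column built from the literal list.
    val first = withResource(new java.io.StringReader("needles")) { reader =>
      reader.read().toChar
    }
    println(first) // prints: n
  }
}

Building the needles on each call trades a little repeated work for not having to track per-thread lifetimes at task completion.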
@@ -18,32 +18,19 @@ package com.nvidia.spark.rapids import ai.rapids.cudf.ColumnVector -import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Predicate} case class GpuInSet( child: Expression, list: Seq[Any]) extends GpuUnaryExpression with Predicate { - @transient private[this] lazy val _needles: ThreadLocal[ColumnVector] = - new ThreadLocal[ColumnVector] - require(list != null, "list should not be null") override def nullable: Boolean = child.nullable || list.contains(null) override def doColumnar(haystack: GpuColumnVector): ColumnVector = { - val needles = getNeedles - haystack.getBase.contains(needles) - } - - private def getNeedles: ColumnVector = { - var needleVec = _needles.get - if (needleVec == null) { - needleVec = buildNeedles - _needles.set(needleVec) - TaskContext.get.addTaskCompletionListener[Unit](_ => _needles.get.close()) + withResource(buildNeedles) { needles => + haystack.getBase.contains(needles) } - needleVec } private def buildNeedles: ColumnVector = diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala index b83751ee8fc..9104e01c024 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala @@ -541,6 +541,7 @@ trait SingleDataBlockInfo { def partitionValues: InternalRow // partition value def dataBlock: DataBlockBase // a single block info of a single file def schema: SchemaBase // schema information + def readSchema: StructType // read schema information def extraInfo: ExtraInfo // extra information } @@ -578,8 +579,7 @@ class BatchContext( * * @param conf Configuration * @param clippedBlocks the block metadata from the original file that has been - * clipped to only contain the column chunks to be read - * @param readDataSchema the Spark schema describing what will be read + * clipped to only contain the column chunks to be read * @param partitionSchema schema of partitions * @param maxReadBatchSizeRows soft limit on the maximum number of rows the reader reads per batch * @param maxReadBatchSizeBytes soft limit on the maximum number of bytes the reader reads per batch @@ -589,7 +589,6 @@ class BatchContext( abstract class MultiFileCoalescingPartitionReaderBase( conf: Configuration, clippedBlocks: Seq[SingleDataBlockInfo], - readDataSchema: StructType, partitionSchema: StructType, maxReadBatchSizeRows: Integer, maxReadBatchSizeBytes: Long, @@ -603,6 +602,7 @@ abstract class MultiFileCoalescingPartitionReaderBase( private case class CurrentChunkMeta( clippedSchema: SchemaBase, + readSchema: StructType, currentChunk: Seq[(Path, DataBlockBase)], numTotalRows: Long, rowsPerPartition: Array[Long], @@ -688,11 +688,12 @@ abstract class MultiFileCoalescingPartitionReaderBase( * @param dataBuffer the data which can be decoded in GPU * @param dataSize data size * @param clippedSchema the clipped schema + * @param readSchema the expected schema * @param extraInfo the extra information for specific file format * @return Table */ def readBufferToTable(dataBuffer: HostMemoryBuffer, dataSize: Long, clippedSchema: SchemaBase, - extraInfo: ExtraInfo): Table + readSchema: StructType, extraInfo: ExtraInfo): Table /** * Write a header for a specific file format. 
If there is no header for the file format, @@ -741,6 +742,21 @@ abstract class MultiFileCoalescingPartitionReaderBase( new BatchContext(chunkedBlocks, clippedSchema) } + /** + * A callback to finalize the output batch. The batch returned will be the final + * output batch of the reader's "get" method. + * + * @param batch the batch after decoding, adding partitioned columns. + * @param extraInfo the corresponding extra information of the input batch. + * @return the finalized columnar batch. + */ + protected def finalizeOutputBatch( + batch: ColumnarBatch, + extraInfo: ExtraInfo): ColumnarBatch = { + // Equivalent to returning the input batch directly. + GpuColumnVector.incRefCounts(batch) + } + override def next(): Boolean = { batch.foreach(_.close()) batch = None @@ -765,7 +781,7 @@ abstract class MultiFileCoalescingPartitionReaderBase( private def readBatch(): Option[ColumnarBatch] = { withResource(new NvtxRange(s"$getFileFormatShortName readBatch", NvtxColor.GREEN)) { _ => val currentChunkMeta = populateCurrentBlockChunk() - if (currentChunkMeta.clippedSchema.fieldNames.isEmpty) { + val retBatch = if (currentChunkMeta.clippedSchema.fieldNames.isEmpty) { // not reading any data, so return a degenerate ColumnarBatch with the row count if (currentChunkMeta.numTotalRows == 0) { None @@ -773,7 +789,7 @@ abstract class MultiFileCoalescingPartitionReaderBase( val rows = currentChunkMeta.numTotalRows.toInt // Someone is going to process this data, even if it is just a row count GpuSemaphore.acquireIfNecessary(TaskContext.get(), metrics(SEMAPHORE_WAIT_TIME)) - val nullColumns = readDataSchema.safeMap(f => + val nullColumns = currentChunkMeta.readSchema.safeMap(f => GpuColumnVector.fromNull(rows, f.dataType).asInstanceOf[SparkVector]) val emptyBatch = new ColumnarBatch(nullColumns.toArray, rows) addAllPartitionValues(Some(emptyBatch), currentChunkMeta.allPartValues, @@ -781,9 +797,9 @@ abstract class MultiFileCoalescingPartitionReaderBase( } } else { val table = readToTable(currentChunkMeta.currentChunk, currentChunkMeta.clippedSchema, - currentChunkMeta.extraInfo) + currentChunkMeta.readSchema, currentChunkMeta.extraInfo) try { - val colTypes = readDataSchema.fields.map(f => f.dataType) + val colTypes = currentChunkMeta.readSchema.fields.map(f => f.dataType) val maybeBatch = table.map(t => GpuColumnVector.from(t, colTypes)) maybeBatch.foreach { batch => logDebug(s"GPU batch size: ${GpuColumnVector.getTotalDeviceMemoryUsed(batch)} bytes") @@ -796,12 +812,16 @@ abstract class MultiFileCoalescingPartitionReaderBase( table.foreach(_.close()) } } + withResource(retBatch) { _ => + retBatch.map(b => finalizeOutputBatch(b, currentChunkMeta.extraInfo)) + } } } private def readToTable( currentChunkedBlocks: Seq[(Path, DataBlockBase)], clippedSchema: SchemaBase, + readDataSchema: StructType, extraInfo: ExtraInfo): Option[Table] = { if (currentChunkedBlocks.isEmpty) { return None @@ -811,7 +831,8 @@ abstract class MultiFileCoalescingPartitionReaderBase( if (dataSize == 0) { None } else { - val table = readBufferToTable(dataBuffer, dataSize, clippedSchema, extraInfo) + val table = readBufferToTable(dataBuffer, dataSize, clippedSchema, readDataSchema, + extraInfo) closeOnExcept(table) { _ => maxDeviceMemory = max(GpuColumnVector.getTotalDeviceMemoryUsed(table), maxDeviceMemory) if (readDataSchema.length < table.getNumberOfColumns) { @@ -890,7 +911,7 @@ abstract class MultiFileCoalescingPartitionReaderBase( // Just ensure to close buffer when there is an exception closeOnExcept(buffer) { _ => logWarning(s"The 
original estimated size $initTotalSize is too small, " + - s"reallocing and copying data to bigger buffer size: $bufferSize") + s"reallocating and copying data to bigger buffer size: $bufferSize") } // Copy the old buffer to a new allocated bigger buffer and close the old buffer buf = withResource(buffer) { _ => @@ -944,10 +965,11 @@ abstract class MultiFileCoalescingPartitionReaderBase( var currentFile: Path = null var currentPartitionValues: InternalRow = null var currentClippedSchema: SchemaBase = null + var currentReadSchema: StructType = null val rowsPerPartition = new ArrayBuffer[Long]() var lastPartRows: Long = 0 val allPartValues = new ArrayBuffer[InternalRow]() - var currrentDataBlock: SingleDataBlockInfo = null + var currentDataBlock: SingleDataBlockInfo = null var extraInfo: ExtraInfo = null @tailrec @@ -955,11 +977,12 @@ abstract class MultiFileCoalescingPartitionReaderBase( if (blockIterator.hasNext) { if (currentFile == null) { // first time of readNextBatch - currrentDataBlock = blockIterator.head + currentDataBlock = blockIterator.head currentFile = blockIterator.head.filePath currentPartitionValues = blockIterator.head.partitionValues allPartValues += currentPartitionValues currentClippedSchema = blockIterator.head.schema + currentReadSchema = blockIterator.head.readSchema extraInfo = blockIterator.head.extraInfo } @@ -969,12 +992,12 @@ abstract class MultiFileCoalescingPartitionReaderBase( } if (numRows == 0 || numRows + peekedRowCount <= maxReadBatchSizeRows) { - val estimatedBytes = GpuBatchUtils.estimateGpuMemory(readDataSchema, peekedRowCount) + val estimatedBytes = GpuBatchUtils.estimateGpuMemory(currentReadSchema, peekedRowCount) if (numBytes == 0 || numBytes + estimatedBytes <= maxReadBatchSizeBytes) { // only care to check if we are actually adding in the next chunk if (currentFile != blockIterator.head.filePath) { // check if need to split next data block into another ColumnarBatch - if (checkIfNeedToSplitDataBlock(currrentDataBlock, blockIterator.head)) { + if (checkIfNeedToSplitDataBlock(currentDataBlock, blockIterator.head)) { logInfo(s"splitting ${blockIterator.head.filePath} into another batch!") return } @@ -992,7 +1015,8 @@ abstract class MultiFileCoalescingPartitionReaderBase( currentFile = blockIterator.head.filePath currentPartitionValues = blockIterator.head.partitionValues currentClippedSchema = blockIterator.head.schema - currrentDataBlock = blockIterator.head + currentReadSchema = blockIterator.head.readSchema + currentDataBlock = blockIterator.head } val nextBlock = blockIterator.next() @@ -1011,7 +1035,7 @@ abstract class MultiFileCoalescingPartitionReaderBase( logDebug(s"Loaded $numRows rows from ${getFileFormatShortName}. " + s"${getFileFormatShortName} bytes read: $numChunkBytes. Estimated GPU bytes: $numBytes. 
" + s"Number of different partitions: ${allPartValues.size}") - CurrentChunkMeta(currentClippedSchema, currentChunk, + CurrentChunkMeta(currentClippedSchema, currentReadSchema, currentChunk, numRows, rowsPerPartition.toArray, allPartValues.toArray, extraInfo) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala index 45fa3c3b320..fa921e517be 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala @@ -21,7 +21,7 @@ import java.net.URI import java.nio.ByteBuffer import java.nio.channels.{Channels, WritableByteChannel} import java.util -import java.util.concurrent.Callable +import java.util.concurrent.{Callable, TimeUnit} import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -35,7 +35,7 @@ import com.google.protobuf.CodedOutputStream import com.nvidia.spark.rapids.GpuMetric._ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.SchemaUtils._ -import com.nvidia.spark.rapids.shims.{OrcReadingShims, OrcShims, ShimFilePartitionReaderFactory} +import com.nvidia.spark.rapids.shims.{OrcCastingShims, OrcReadingShims, OrcShims, ShimFilePartitionReaderFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.io.DiskRangeList @@ -211,6 +211,28 @@ object GpuOrcScan extends Arm { } else { downCastAnyInteger(col, toDt) } + + // bool to float, double(float64) + case (DType.BOOL8, DType.FLOAT32 | DType.FLOAT64) => + col.castTo(toDt) + + // bool to string + case (DType.BOOL8, DType.STRING) => + withResource(col.castTo(toDt)) { casted => + // cuDF produces "ture"/"false" while CPU outputs "TRUE"/"FALSE". + casted.upper() + } + + // integer to float, double(float64), string + case (DType.INT8 | DType.INT16 | DType.INT32 | DType.INT64, + DType.FLOAT32 | DType.FLOAT64 | DType.STRING) => + col.castTo(toDt) + + // {bool, integer types} to timestamp(micro seconds) + case (DType.BOOL8 | DType.INT8 | DType.INT16 | DType.INT32 | DType.INT64, + DType.TIMESTAMP_MICROSECONDS) => + OrcCastingShims.castIntegerToTimestamp(col, fromDt) + // TODO more types, tracked in https://github.com/NVIDIA/spark-rapids/issues/5895 case (f, t) => throw new QueryExecutionException(s"Unsupported type casting: $f -> $t") @@ -231,19 +253,33 @@ object GpuOrcScan extends Arm { // Align with what CPU does. return false } + val toType = to.getCategory from.getCategory match { case BOOLEAN | BYTE | SHORT | INT | LONG => - to.getCategory match { - case BOOLEAN | BYTE | SHORT | INT | LONG => true + toType match { + case BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | STRING | + TIMESTAMP => true + // BINARY and DATE are not supported by design. + // The 'to' type (aka read schema) is from Spark, and VARCHAR and CHAR will + // be replaced by STRING. Meanwhile, cuDF doesn't support them as output + // types, and also replaces them with STRING. + // TIMESTAMP_INSTANT is not supported by cuDF. case _ => false } case VARCHAR => - to.getCategory == STRING - // TODO more types, tracked in https://github.com/NVIDIA/spark-rapids/issues/5895 - case _ => - false + toType == STRING + case _ => false } } + + /** + * Test whether if a * b will cause Long-overflow. + * In Math.multiplyExact, if there is an integer-overflow, then it will throw an + * ArithmeticException. 
+ */ + def testLongMultiplicationOverflow(a: Long, b: Long) = { + Math.multiplyExact(a, b) + } } /** @@ -316,6 +352,7 @@ case class GpuOrcMultiFilePartitionReaderFactory( // we must split the different compress files into different ColumnarBatch. // So here try the best to group the same compression files together before hand. val compressionAndStripes = LinkedHashMap[CompressionKind, ArrayBuffer[OrcSingleStripeMeta]]() + val currentTime = System.nanoTime() files.map { file => val orcPartitionReaderContext = filterHandler.filterStripes(file, dataSchema, readDataSchema, partitionSchema) @@ -327,8 +364,12 @@ case class GpuOrcMultiFilePartitionReaderFactory( OrcDataStripe(OrcStripeWithMeta(block, orcPartitionReaderContext)), file.partitionValues, OrcSchemaWrapper(orcPartitionReaderContext.updatedReadSchema), + readDataSchema, OrcExtraInfo(orcPartitionReaderContext.requestedMapping))) } + metrics.get("scanTime").foreach { + _ += TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - currentTime) + } val clippedStripes = compressionAndStripes.values.flatten.toSeq new MultiFileOrcPartitionReader(conf, files, clippedStripes, readDataSchema, debugDumpPrefix, maxReadBatchSizeRows, maxReadBatchSizeBytes, metrics, partitionSchema, numThreads, @@ -1589,6 +1630,7 @@ private case class OrcSingleStripeMeta( dataBlock: OrcDataStripe, // Orc stripe information with the OrcPartitionReaderContext partitionValues: InternalRow, // partitioned values schema: OrcSchemaWrapper, // Orc schema + readSchema: StructType, // Orc read schema extraInfo: OrcExtraInfo // Orc ExtraInfo containing the requested column ids ) extends SingleDataBlockInfo @@ -1619,7 +1661,7 @@ class MultiFileOrcPartitionReader( partitionSchema: StructType, numThreads: Int, isCaseSensitive: Boolean) - extends MultiFileCoalescingPartitionReaderBase(conf, clippedStripes, readDataSchema, + extends MultiFileCoalescingPartitionReaderBase(conf, clippedStripes, partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, numThreads, execMetrics) with OrcCommonFunctions { @@ -1861,6 +1903,7 @@ class MultiFileOrcPartitionReader( dataBuffer: HostMemoryBuffer, dataSize: Long, clippedSchema: SchemaBase, + readSchema: StructType, extraInfo: ExtraInfo): Table = decodeToTable(dataBuffer, dataSize, clippedSchema, extraInfo.requestedMapping, isCaseSensitive, files) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index dde253f5897..cf33be44905 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -985,53 +985,18 @@ object GpuOverrides extends Logging { private[this] lazy val rhsDecimalType = DecimalUtil.asDecimalType(rhs.wrapped.asInstanceOf[Expression].dataType) - override def tagExprForGpu(): Unit = { - a.child match { - // Division and Multiplication of Decimal types is a little odd. Spark will cast the - // inputs to a common wider value where the scale is the max of the two input scales, - // and the precision is max of the two input non-scale portions + the new scale. Then it - // will do the divide or multiply as a BigDecimal value but lie about the return type. - // Finally here in CheckOverflow it will reset the scale and check the precision so that - // Spark knows it fits in the final desired result. - // Here we try to strip out the extra casts, etc to get to as close to the original - // query as possible. 
This lets us then calculate what CUDF needs to get the correct - // answer, which in some cases is a lot smaller. - case _: Divide => - val intermediatePrecision = - GpuDecimalDivide.nonRoundedIntermediateArgPrecision(lhsDecimalType, - rhsDecimalType, a.dataType) - - if (intermediatePrecision > DType.DECIMAL128_MAX_PRECISION) { - if (conf.needDecimalGuarantees) { - binExpr.willNotWorkOnGpu(s"the intermediate precision of " + - s"$intermediatePrecision that is required to guarantee no overflow issues " + - s"for this divide is too large to be supported on the GPU") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"${lhs.dataType} / ${rhs.dataType} produces ${a.dataType} with an " + - s"intermediate precision of $intermediatePrecision") - } - } - case _: Multiply => - val intermediatePrecision = - GpuDecimalMultiply.nonRoundedIntermediatePrecision(lhsDecimalType, - rhsDecimalType, a.dataType) - if (intermediatePrecision > DType.DECIMAL128_MAX_PRECISION) { - if (conf.needDecimalGuarantees) { - binExpr.willNotWorkOnGpu(s"the intermediate precision of " + - s"$intermediatePrecision that is required to guarantee no overflow issues " + - s"for this multiply is too large to be supported on the GPU") - } else { - logWarning("Decimal overflow guarantees disabled for " + - s"${lhs.dataType} * ${rhs.dataType} produces ${a.dataType} with an " + - s"intermediate precision of $intermediatePrecision") - } - } - case _ => // NOOP - } - } - override def convertToGpu(): GpuExpression = { + // Prior to Spark 3.4.0 + // Division and Multiplication of Decimal types is a little odd. Spark will cast the + // inputs to a common wider value where the scale is the max of the two input scales, + // and the precision is max of the two input non-scale portions + the new scale. Then it + // will do the divide or multiply as a BigDecimal value but lie about the return type. + // Finally here in CheckOverflow it will reset the scale and check the precision so that + // Spark knows it fits in the final desired result. + // Here we try to strip out the extra casts, etc to get to as close to the original + // query as possible. This lets us then calculate what CUDF needs to get the correct + // answer, which in some cases is a lot smaller. + a.child match { case _: Divide => // GpuDecimalDivide includes the overflow check in it. 
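For context on the decimal changes above: both the removed tagging logic and the retained convertToGpu path hinge on whether the exact intermediate result of a decimal multiply or divide still fits in 128-bit decimal (38 digits). A standalone illustration of why it may not, based on Spark's documented result-type rule for decimal multiplication (precision p1 + p2 + 1, scale s1 + s2 before capping); this is only a sketch, not the plugin's nonRoundedIntermediatePrecision code:

object DecimalMultiplyPrecision {
  val Decimal128MaxPrecision = 38

  // Spark's documented (pre-cap) result type for Decimal(p1, s1) * Decimal(p2, s2).
  def exactProductType(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) =
    (p1 + p2 + 1, s1 + s2)

  def main(args: Array[String]): Unit = {
    val (precision, scale) = exactProductType(20, 2, 20, 2)
    // 41 digits are needed to hold the exact product, which exceeds DECIMAL128;
    // the hunk above passes useLongMultiply = true for exactly this situation.
    println(s"exact product needs DECIMAL($precision, $scale); limit is $Decimal128MaxPrecision")
  }
}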
@@ -1042,7 +1007,7 @@ object GpuOverrides extends Logging { GpuDecimalMultiply.nonRoundedIntermediatePrecision(lhsDecimalType, rhsDecimalType, a.dataType) GpuDecimalMultiply(lhs.convertToGpu(), rhs.convertToGpu(), wrapped.dataType, - needsExtraOverflowChecks = intermediatePrecision > DType.DECIMAL128_MAX_PRECISION) + useLongMultiply = intermediatePrecision > DType.DECIMAL128_MAX_PRECISION) case _ => GpuCheckOverflow(childExprs.head.convertToGpu(), wrapped.dataType, wrapped.nullOnOverflow) @@ -1856,12 +1821,30 @@ object GpuOverrides extends Logging { // passing the already converted strf string for a little optimization GpuFromUnixTime(lhs, rhs, strfFormat) }), + expr[FromUTCTimestamp]( + "Render the input UTC timestamp in the input timezone", + ExprChecks.binaryProject(TypeSig.TIMESTAMP, TypeSig.TIMESTAMP, + ("timestamp", TypeSig.TIMESTAMP, TypeSig.TIMESTAMP), + ("timezone", TypeSig.lit(TypeEnum.STRING) + .withPsNote(TypeEnum.STRING, "Only timezones equivalent to UTC are supported"), + TypeSig.lit(TypeEnum.STRING))), + (a, conf, p, r) => new FromUTCTimestampExprMeta(a, conf, p, r) + ), expr[Pmod]( "Pmod", ExprChecks.binaryProject(TypeSig.gpuNumeric, TypeSig.cpuNumeric, - ("lhs", TypeSig.gpuNumeric, TypeSig.cpuNumeric), + ("lhs", TypeSig.gpuNumeric.withPsNote(TypeEnum.DECIMAL, + s"decimals with precision ${DecimalType.MAX_PRECISION} are not supported"), + TypeSig.cpuNumeric), ("rhs", TypeSig.gpuNumeric, TypeSig.cpuNumeric)), (a, conf, p, r) => new BinaryExprMeta[Pmod](a, conf, p, r) { + override def tagExprForGpu(): Unit = { + a.dataType match { + case dt: DecimalType if dt.precision == DecimalType.MAX_PRECISION => + willNotWorkOnGpu("pmod at maximum decimal precision is not supported") + case _ => + } + } override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = GpuPmod(lhs, rhs) }), @@ -2128,10 +2111,7 @@ object GpuOverrides extends Logging { expr[Divide]( "Division", ExprChecks.binaryProject( - TypeSig.DOUBLE + TypeSig.DECIMAL_128 + - TypeSig.psNote(TypeEnum.DECIMAL, - "Because of Spark's inner workings the full range of decimal precision " + - "(even for 128-bit values) is not supported."), + TypeSig.DOUBLE + TypeSig.DECIMAL_128, TypeSig.DOUBLE + TypeSig.DECIMAL_128, ("lhs", TypeSig.DOUBLE + TypeSig.DECIMAL_128, TypeSig.DOUBLE + TypeSig.DECIMAL_128), @@ -2311,23 +2291,16 @@ object GpuOverrides extends Logging { TypeSig.orderable, Seq(ParamCheck("input", (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL + TypeSig.STRUCT) - .nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL) - .withPsNote(Seq(TypeEnum.DOUBLE, TypeEnum.FLOAT), nanAggPsNote), + .nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL), TypeSig.orderable))).asInstanceOf[ExprChecksImpl].contexts ++ ExprChecks.windowOnly( (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL), TypeSig.orderable, Seq(ParamCheck("input", - (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL) - .withPsNote(Seq(TypeEnum.DOUBLE, TypeEnum.FLOAT), nanAggPsNote), + (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL), TypeSig.orderable))).asInstanceOf[ExprChecksImpl].contexts), (a, conf, p, r) => new AggExprMeta[Min](a, conf, p, r) { - override def tagAggForGpu(): Unit = { - val dataType = a.child.dataType - checkAndTagFloatNanAgg("Min", dataType, conf, this) - } - override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = GpuMin(childExprs.head) @@ -2993,7 +2966,7 @@ object GpuOverrides extends Logging { "the older versions of Spark in this instance 
and handle NaNs the same as 3.1.3+"), expr[ArraysOverlap]( "Returns true if a1 contains at least a non-null element present also in a2. If the arrays " + - "have no common element and they are both non-empty and either of them contains a null " + + "have no common element and they are both non-empty and either of them contains a null " + "element null is returned, false otherwise.", ExprChecks.binaryProject(TypeSig.BOOLEAN, TypeSig.BOOLEAN, ("array1", @@ -3204,10 +3177,6 @@ object GpuOverrides extends Logging { }), expr[MapConcat]( "Returns the union of all the given maps", - // Currently, GpuMapConcat supports nested values but not nested keys. - // We will add the nested key support after - // cuDF can fully support nested types in lists::drop_list_duplicates. - // Issue link: https://github.com/rapidsai/cudf/issues/11093 ExprChecks.projectOnly(TypeSig.MAP.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + TypeSig.NULL + TypeSig.ARRAY + TypeSig.STRUCT + TypeSig.MAP), TypeSig.MAP.nested(TypeSig.all), @@ -3216,13 +3185,6 @@ object GpuOverrides extends Logging { TypeSig.NULL + TypeSig.ARRAY + TypeSig.STRUCT + TypeSig.MAP), TypeSig.MAP.nested(TypeSig.all)))), (a, conf, p, r) => new ComplexTypeMergingExprMeta[MapConcat](a, conf, p, r) { - override def tagExprForGpu(): Unit = { - a.dataType.keyType match { - case MapType(_,_,_) | ArrayType(_,_) | StructType(_) => willNotWorkOnGpu( - s"GpuMapConcat does not currently support the key type ${a.dataType.keyType}.") - case _ => - } - } override def convertToGpu(child: Seq[Expression]): GpuExpression = GpuMapConcat(child) }), expr[ConcatWs]( @@ -3421,7 +3383,7 @@ object GpuOverrides extends Logging { TypeSig.ARRAY.nested(TypeSig.all), Seq(ParamCheck("input", (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 + - TypeSig.NULL + + TypeSig.NULL + TypeSig.STRUCT.withPsNote(TypeEnum.STRUCT, "Support for structs containing " + s"float/double array columns requires ${RapidsConf.HAS_NANS} to be set to false") + TypeSig.ARRAY.withPsNote(TypeEnum.ARRAY, "Support for arrays of arrays of " + @@ -3431,7 +3393,7 @@ object GpuOverrides extends Logging { private def isNestedArrayType(dt: DataType): Boolean = { dt match { - case StructType(fields) => + case StructType(fields) => fields.exists { field => field.dataType match { case sdt: StructType => isNestedArrayType(sdt) @@ -4460,6 +4422,9 @@ case class GpuOverrides() extends Rule[SparkPlan] with Logging { * check for a ScalaUDF using a tahoe.Snapshot function and if we ever see * an AdaptiveSparkPlan on a Spark version we don't expect, fallback to the * CPU for those plans. + * Note that the Delta Lake delta log checkpoint parquet files are just inefficient + * to have to copy the data to GPU and then back off after it does the scan on + * Delta Table Checkpoint, so have the entire plan fallback to CPU at that point. 
*/ def isDeltaLakeMetadataQuery(plan: SparkPlan): Boolean = { val deltaLogScans = PlanUtils.findOperators(plan, { @@ -4469,17 +4434,21 @@ case class GpuOverrides() extends Rule[SparkPlan] with Logging { true case f: FileSourceScanExec => // example filename: "file:/tmp/delta-table/_delta_log/00000000000000000000.json" - val found = f.relation.inputFiles.exists(name => - name.contains("/_delta_log/") && name.endsWith(".json")) + val found = f.relation.inputFiles.exists { name => + name.contains("/_delta_log/") && name.endsWith(".json") + } if (found) { logDebug(s"Fallback for FileSourceScanExec delta log: $f") } found case rdd: RDDScanExec => - // example rdd name: "Delta Table State #1 - file:///tmp/delta-table/_delta_log" + // example rdd name: "Delta Table State #1 - file:///tmp/delta-table/_delta_log" or + // "Scan ExistingRDD Delta Table Checkpoint with Stats #1 - + // file:///tmp/delta-table/_delta_log" val found = rdd.inputRDD != null && rdd.inputRDD.name != null && - rdd.inputRDD.name.startsWith("Delta Table State") && + (rdd.inputRDD.name.startsWith("Delta Table State") + || rdd.inputRDD.name.startsWith("Delta Table Checkpoint")) && rdd.inputRDD.name.endsWith("/_delta_log") if (found) { logDebug(s"Fallback for RDDScanExec delta log: $rdd") diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index a7902f37104..a24472947f9 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -765,10 +765,18 @@ private case class GpuParquetFileFilterHandler(@transient sqlConf: SQLConf) exte } } case array: ArrayType => - val fileChild = fileType.asGroupType().getType(0) - .asGroupType().getType(0) - checkSchemaCompat(fileChild, array.elementType, errorCallback, isCaseSensitive, useFieldId, - rootFileType, rootReadType) + if (fileType.isPrimitive) { + if (fileType.getRepetition == Type.Repetition.REPEATED) { + checkSchemaCompat(fileType, array.elementType, errorCallback, isCaseSensitive, + useFieldId, rootFileType, rootReadType) + } else { + errorCallback(fileType, readType) + } + } else { + val fileChild = fileType.asGroupType().getType(0).asGroupType().getType(0) + checkSchemaCompat(fileChild, array.elementType, errorCallback, isCaseSensitive, + useFieldId, rootFileType, rootReadType) + } case map: MapType => val parquetMap = fileType.asGroupType().getType(0).asGroupType() @@ -976,6 +984,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( files: Array[PartitionedFile], conf: Configuration): PartitionReader[ColumnarBatch] = { val clippedBlocks = ArrayBuffer[ParquetSingleDataBlockMeta]() + val currentTime = System.nanoTime() files.map { file => val singleFileInfo = try { filterHandler.filterBlocks(footerReadType, file, conf, filters, readDataSchema) @@ -1000,14 +1009,16 @@ case class GpuParquetMultiFilePartitionReaderFactory( ParquetDataBlock(block), file.partitionValues, ParquetSchemaWrapper(singleFileInfo.schema), - ParquetExtraInfo(singleFileInfo.isCorrectedRebaseMode, + singleFileInfo.readSchema, + new ParquetExtraInfo(singleFileInfo.isCorrectedRebaseMode, singleFileInfo.isCorrectedInt96RebaseMode, singleFileInfo.hasInt96Timestamps))) } - new MultiFileParquetPartitionReader(conf, files, clippedBlocks, - isCaseSensitive, readDataSchema, debugDumpPrefix, - maxReadBatchSizeRows, maxReadBatchSizeBytes, metrics, - partitionSchema, numThreads, ignoreMissingFiles, ignoreCorruptFiles, - 
readUseFieldId) + metrics.get("scanTime").foreach { + _ += TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - currentTime) + } + new MultiFileParquetPartitionReader(conf, files, clippedBlocks, isCaseSensitive, + debugDumpPrefix, maxReadBatchSizeRows, maxReadBatchSizeBytes, metrics, + partitionSchema, numThreads, ignoreMissingFiles, ignoreCorruptFiles, readUseFieldId) } /** @@ -1415,8 +1426,8 @@ private case class ParquetDataBlock(dataBlock: BlockMetaData) extends DataBlockB } /** Parquet extra information containing isCorrectedRebaseMode */ -case class ParquetExtraInfo(isCorrectedRebaseMode: Boolean, - isCorrectedInt96RebaseMode: Boolean, hasInt96Timestamps: Boolean) extends ExtraInfo +class ParquetExtraInfo(val isCorrectedRebaseMode: Boolean, + val isCorrectedInt96RebaseMode: Boolean, val hasInt96Timestamps: Boolean) extends ExtraInfo // contains meta about a single block in a file private case class ParquetSingleDataBlockMeta( @@ -1424,6 +1435,7 @@ private case class ParquetSingleDataBlockMeta( dataBlock: ParquetDataBlock, partitionValues: InternalRow, schema: ParquetSchemaWrapper, + readSchema: StructType, extraInfo: ParquetExtraInfo) extends SingleDataBlockInfo /** @@ -1440,7 +1452,6 @@ private case class ParquetSingleDataBlockMeta( * @param clippedBlocks the block metadata from the original Parquet file that has been clipped * to only contain the column chunks to be read * @param isSchemaCaseSensitive whether schema is case sensitive - * @param readDataSchema the Spark schema describing what will be read * @param debugDumpPrefix a path prefix to use for dumping the fabricated Parquet data or null * @param maxReadBatchSizeRows soft limit on the maximum number of rows the reader reads per batch * @param maxReadBatchSizeBytes soft limit on the maximum number of bytes the reader reads per batch @@ -1455,7 +1466,6 @@ class MultiFileParquetPartitionReader( splits: Array[PartitionedFile], clippedBlocks: Seq[ParquetSingleDataBlockMeta], override val isSchemaCaseSensitive: Boolean, - readDataSchema: StructType, debugDumpPrefix: String, maxReadBatchSizeRows: Integer, maxReadBatchSizeBytes: Long, @@ -1465,7 +1475,7 @@ class MultiFileParquetPartitionReader( ignoreMissingFiles: Boolean, ignoreCorruptFiles: Boolean, useFieldId: Boolean) - extends MultiFileCoalescingPartitionReaderBase(conf, clippedBlocks, readDataSchema, + extends MultiFileCoalescingPartitionReaderBase(conf, clippedBlocks, partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, numThreads, execMetrics) with ParquetPartitionReaderBase { @@ -1477,7 +1487,7 @@ class MultiFileParquetPartitionReader( block.asInstanceOf[ParquetDataBlock].dataBlock implicit def toDataBlockBase(blocks: Seq[BlockMetaData]): Seq[DataBlockBase] = - blocks.map(ParquetDataBlock(_)) + blocks.map(ParquetDataBlock) implicit def toBlockMetaDataSeq(blocks: Seq[DataBlockBase]): Seq[BlockMetaData] = blocks.map(_.asInstanceOf[ParquetDataBlock].dataBlock) @@ -1573,7 +1583,7 @@ class MultiFileParquetPartitionReader( override final def getFileFormatShortName: String = "Parquet" override def readBufferToTable(dataBuffer: HostMemoryBuffer, dataSize: Long, - clippedSchema: SchemaBase, extraInfo: ExtraInfo): Table = { + clippedSchema: SchemaBase, readDataSchema: StructType, extraInfo: ExtraInfo): Table = { // Dump parquet data into a file dumpDataToFile(dataBuffer, dataSize, splits, Option(debugDumpPrefix), Some("parquet")) @@ -1656,7 +1666,7 @@ class MultiFileParquetPartitionReader( * processed on the GPU. This affects the amount of host memory used. 
* @param ignoreMissingFiles Whether to ignore missing files * @param ignoreCorruptFiles Whether to ignore corrupt files - * @param useFieldId Whether to ignore corrupt files + * @param useFieldId Whether to use field id for column matching */ class MultiFileCloudParquetPartitionReader( override val conf: Configuration, @@ -1843,8 +1853,7 @@ class MultiFileCloudParquetPartitionReader( val (hostBuffer, size) = memBuffersAndSize.head val nextBatch = readBufferToTable(buffer.isCorrectRebaseMode, buffer.isCorrectInt96RebaseMode, buffer.hasInt96Timestamps, buffer.clippedSchema, - buffer.readSchema, buffer.partitionedFile.partitionValues, - hostBuffer, size, buffer.partitionedFile.filePath) + buffer.readSchema, buffer.partitionedFile, hostBuffer, size) if (memBuffersAndSize.length > 1) { val updatedBuffers = memBuffersAndSize.drop(1) currentFileHostBuffers = Some(buffer.copy(memBuffersAndSizes = updatedBuffers)) @@ -1862,10 +1871,9 @@ class MultiFileCloudParquetPartitionReader( hasInt96Timestamps: Boolean, clippedSchema: MessageType, readDataSchema: StructType, - partValues: InternalRow, + partedFile: PartitionedFile, hostBuffer: HostMemoryBuffer, - dataSize: Long, - fileName: String): Option[ColumnarBatch] = { + dataSize: Long): Option[ColumnarBatch] = { val table = withResource(hostBuffer) { _ => // Dump parquet data into a file @@ -1885,7 +1893,7 @@ class MultiFileCloudParquetPartitionReader( maxDeviceMemory = max(GpuColumnVector.getTotalDeviceMemoryUsed(table), maxDeviceMemory) if (readDataSchema.length < table.getNumberOfColumns) { throw new QueryExecutionException(s"Expected ${readDataSchema.length} columns " + - s"but read ${table.getNumberOfColumns} from $fileName") + s"but read ${table.getNumberOfColumns} from ${partedFile.filePath}") } } metrics(NUM_OUTPUT_BATCHES) += 1 @@ -1900,7 +1908,7 @@ class MultiFileCloudParquetPartitionReader( } // we have to add partition values here for this batch, we already verified that // its not different for all the blocks in this batch - addPartitionValues(maybeBatch, partValues, partitionSchema) + addPartitionValues(maybeBatch, partedFile.partitionValues, partitionSchema) } finally { table.foreach(_.close()) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ParquetSchemaUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ParquetSchemaUtils.scala index 00508b9dd49..09d8cf8257b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ParquetSchemaUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ParquetSchemaUtils.scala @@ -345,7 +345,7 @@ object ParquetSchemaUtils extends Arm { sparkType match { case t: ArrayType => // Only clips array types with nested type as element type. - clipSparkArrayType(t, parquetType.asGroupType(), caseSensitive, useFieldId) + clipSparkArrayType(t, parquetType, caseSensitive, useFieldId) case t: MapType => clipSparkMapType(t, parquetType.asGroupType(), caseSensitive, useFieldId) @@ -360,18 +360,20 @@ object ParquetSchemaUtils extends Arm { private def clipSparkArrayType( sparkType: ArrayType, - parquetList: GroupType, + parquetList: Type, caseSensitive: Boolean, useFieldId: Boolean): DataType = { val elementType = sparkType.elementType // Unannotated repeated group should be interpreted as required list of required element, so // list element type is just the group itself. // TODO: When we drop Spark 3.1.x, this should use Parquet's LogicalTypeAnnotation + // Note that the original type is not null for leaf nodes. 
//if (parquetList.getLogicalTypeAnnotation == null && - if (parquetList.getOriginalType == null && + val newSparkType = if (parquetList.getOriginalType == null && parquetList.isRepetition(Repetition.REPEATED)) { clipSparkType(elementType, parquetList, caseSensitive, useFieldId) } else { + val parquetListGroup = parquetList.asGroupType() assert( // TODO: When we drop Spark 3.1.x, this should use Parquet's LogicalTypeAnnotation //parquetList.getLogicalTypeAnnotation.isInstanceOf[ListLogicalTypeAnnotation], @@ -381,14 +383,15 @@ object ParquetSchemaUtils extends Arm { "ListLogicalTypeAnnotation: " + parquetList.toString) assert( - parquetList.getFieldCount == 1 && parquetList.getType(0).isRepetition(Repetition.REPEATED), + parquetListGroup.getFieldCount == 1 && + parquetListGroup.getType(0).isRepetition(Repetition.REPEATED), "Invalid Parquet schema. " + "LIST-annotated group should only have exactly one repeated field: " + parquetList) - val repeated = parquetList.getType(0) - val newSparkType = if (repeated.isPrimitive) { - clipSparkType(elementType, parquetList.getType(0), caseSensitive, useFieldId) + val repeated = parquetListGroup.getType(0) + if (repeated.isPrimitive) { + clipSparkType(elementType, parquetListGroup.getType(0), caseSensitive, useFieldId) } else { val repeatedGroup = repeated.asGroupType() @@ -408,9 +411,9 @@ object ParquetSchemaUtils extends Arm { } clipSparkType(elementType, parquetElementType, caseSensitive, useFieldId) } - - sparkType.copy(elementType = newSparkType) } + + sparkType.copy(elementType = newSparkType) } private def clipSparkMapType( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/nullExpressions.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/nullExpressions.scala index 5827c2dfa4b..8d60e36dad8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/nullExpressions.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/nullExpressions.scala @@ -23,7 +23,7 @@ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.shims.ShimExpression import org.apache.spark.sql.catalyst.expressions.{ComplexTypeMergingExpression, Expression, Predicate} -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{DataType, DoubleType, FloatType} import org.apache.spark.sql.vectorized.ColumnarBatch object GpuNvl extends Arm { @@ -140,6 +140,26 @@ case class GpuIsNan(child: Expression) extends GpuUnaryExpression with Predicate input.getBase.isNan } +/* + * Replace all `Nan`s in child to `null`s. + * The data type of child can only be FloatType or DoubleType. + * + * This class is used in `GpuFloatMin`. + */ +case class GpuNansToNulls(child: Expression) extends GpuUnaryExpression{ + + override def dataType: DataType = child.dataType match { + case FloatType => FloatType + case DoubleType => DoubleType + case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType") + } + + override protected def doColumnar(input: GpuColumnVector): ColumnVector = + input.getBase.nansToNulls + + override def nullable = true +} + /** * A GPU accelerated predicate that is evaluated to be true if there are at least `n` non-null * and non-NaN values. 
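The GpuNansToNulls expression added above exists so that a later cuDF min kernel never sees a NaN: each NaN is mapped to null while other values, including existing nulls, pass through untouched. A plain-Scala, cuDF-free sketch of that per-element behaviour, with Option standing in for nullability:

object NansToNullsSketch {
  def nansToNulls(values: Seq[Option[Double]]): Seq[Option[Double]] =
    values.map {
      case Some(v) if v.isNaN => None // NaN becomes null
      case other => other             // nulls and ordinary values are unchanged
    }

  def main(args: Array[String]): Unit = {
    val in = Seq(Some(1.0), Some(Double.NaN), None, Some(-0.5))
    println(nansToNulls(in)) // List(Some(1.0), None, None, Some(-0.5))
  }
}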
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/AggregateFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/AggregateFunctions.scala index c064a41c05c..f54e8ea3050 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/AggregateFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/AggregateFunctions.scala @@ -360,6 +360,15 @@ class CudfMin(override val dataType: DataType) extends CudfAggregate { override val name: String = "CudfMin" } +/** + * Check if all values in a boolean column are true. + * The cuDF `all` aggregation does not work for reductions or group-by aggregations, + * so we use `Min` as a workaround. + */ +object CudfAll { + def apply(): CudfAggregate = new CudfMin(BooleanType) +} + class CudfCollectList(override val dataType: DataType) extends CudfAggregate { override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = (col: cudf.ColumnVector) => col.reduce(ReductionAggregation.collectList(), DType.LIST) @@ -470,10 +479,18 @@ class CudfMergeM2 extends CudfAggregate { StructField("m2", DoubleType, nullable = true) :: Nil) } -case class GpuMin(child: Expression) extends GpuAggregateFunction +object GpuMin { + def apply(child: Expression): GpuMin = child.dataType match { + case FloatType | DoubleType => GpuFloatMin(child) + case _ => GpuBasicMin(child) + } +} + +abstract class GpuMin(child: Expression) extends GpuAggregateFunction with GpuBatchedRunningWindowWithFixer with GpuAggregateWindowFunction - with GpuRunningWindowFunction { + with GpuRunningWindowFunction + with Serializable { override lazy val initialValues: Seq[GpuLiteral] = Seq(GpuLiteral(null, child.dataType)) override lazy val inputProjection: Seq[Expression] = Seq(child) override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfMin(child.dataType)) @@ -522,6 +539,123 @@ case class GpuMin(child: Expression) extends GpuAggregateFunction } } +/** Min aggregation without `Nan` handling */ +case class GpuBasicMin(child: Expression) extends GpuMin(child) + +/** GpuMin for FloatType and DoubleType to handle `Nan`s. + * + * In Spark, `Nan` is the max float value; in cuDF, however, the calculation + * involving `Nan` is undefined. + * We design a workaround here to match Spark's behaviour. + * The high-level idea is: + * if the column contains only `Nan`s or `null`s + * then + if the column contains `Nan` + * then return `Nan` + * else return `null` + * else + * replace all `Nan`s with nulls; + * use cuDF kernel to find the min value + */ +case class GpuFloatMin(child: Expression) extends GpuMin(child) + with GpuReplaceWindowFunction { + + override val dataType: DataType = child.dataType match { + case FloatType | DoubleType => child.dataType + case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType") + } + + protected val nan: Any = child.dataType match { + case FloatType => Float.NaN + case DoubleType => Double.NaN + case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType") + } + + protected lazy val updateAllNansOrNulls = CudfAll() + protected lazy val updateHasNan = CudfAny() + protected lazy val updateMinVal = new CudfMin(dataType) + + protected lazy val mergeAllNansOrNulls = CudfAll() + protected lazy val mergeHasNan = CudfAny() + protected lazy val mergeMinVal = new CudfMin(dataType) + + // Project 3 columns: + // 1. A boolean column indicating whether the values in `child` are `Nan`s or `null`s + // 2.
A boolean column indicating whether the values in `child` are `Nan`s + // 3. Replace all `Nan`s in the `child` with `null`s + override lazy val inputProjection: Seq[Expression] = Seq( + GpuOr(GpuIsNan(child), GpuIsNull(child)), + GpuIsNan(child), + // We must eliminate all `Nan`s before calling the cuDF min kernel. + // As this expression is only used when `allNansOrNulls` = false, + // and `Nan` is the max value in Spark, the elimination will + // not affect the final result. + GpuNansToNulls(child) + ) + // 1. Check if all values in the `child` are `Nan`s or `null`s + // 2. Check if `child` contains `Nan` + // 3. Calculate the min value on `child` with all `Nan`s replaced. + override lazy val updateAggregates: Seq[CudfAggregate] = + Seq(updateAllNansOrNulls, updateHasNan, updateMinVal) + + // If the column only contains `Nan`s or `null`s + // Then + // if the column contains `Nan` + // then return `Nan` + // else return `null` + // Else return the min value + override lazy val postUpdate: Seq[Expression] = Seq( + GpuIf( + updateAllNansOrNulls.attr, + GpuIf( + updateHasNan.attr, GpuLiteral(nan, dataType), GpuLiteral(null, dataType) + ), + updateMinVal.attr + ) + ) + + // Same logic as the `inputProjection` stage. + override lazy val preMerge: Seq[Expression] = Seq( + GpuOr(GpuIsNan(evaluateExpression), GpuIsNull(evaluateExpression)), + GpuIsNan(evaluateExpression), + GpuNansToNulls(evaluateExpression) + ) + + // Same logic as the `updateAggregates` stage. + override lazy val mergeAggregates: Seq[CudfAggregate] = + Seq(mergeAllNansOrNulls, mergeHasNan, mergeMinVal) + + // Same logic as the `postUpdate` stage. + override lazy val postMerge: Seq[Expression] = Seq( + GpuIf( + mergeAllNansOrNulls.attr, + GpuIf( + mergeHasNan.attr, GpuLiteral(nan, dataType), GpuLiteral(null, dataType) + ), + mergeMinVal.attr + ) + ) + + // We should always override the windowing expression to handle `Nan`. + override def shouldReplaceWindow(spec: GpuWindowSpecDefinition): Boolean = true + + override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = { + // The `GpuBasicMin` here has the same functionality as `CudfAll`, + // as `true > false` in cuDF. + val allNansOrNull = GpuWindowExpression( + GpuBasicMin(GpuOr(GpuIsNan(child), GpuIsNull(child))), spec + ) + val hasNan = GpuWindowExpression(GpuBasicMax(GpuIsNan(child)), spec) + // We use `GpuBasicMin` rather than `GpuMin` to avoid self-recursion. + val min = GpuWindowExpression(GpuBasicMin(GpuNansToNulls(child)), spec) + GpuIf( + allNansOrNull, + GpuIf(hasNan, GpuLiteral(nan, dataType), GpuLiteral(null, dataType)), + min + ) + } +} + object GpuMax { def apply(child: Expression): GpuMax = { child.dataType match { @@ -584,12 +718,13 @@ abstract class GpuMax(child: Expression) extends GpuAggregateFunction } } -/** Max aggregation without `NaN` handling */ +/** Max aggregation without `Nan` handling */ case class GpuBasicMax(child: Expression) extends GpuMax(child) -/** Max aggregation for FloatType and DoubleType to handle `NaN`s. +/** Max aggregation for FloatType and DoubleType to handle `Nan`s. * - * In Spark, `Nan` is the max float value, however in cuDF, `Infinity` is. + * In Spark, `Nan` is the max float value; in cuDF, however, the calculation + * involving `Nan` is undefined. * We design a workaround method here to match the Spark's behaviour. * The high level idea is that, in the projection stage, we create another * column `isNan`.
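// A CPU-side reference (a minimal plain-Scala sketch, not plugin code) of the three-aggregate
// scheme GpuFloatMin uses above, so the intended semantics can be checked without a GPU.
// `allNansOrNulls`, `hasNan` and the min over the Nan-free values mirror the update/merge
// aggregates, and the final selection mirrors the postUpdate expression. The
// Seq[Option[Double]] column model and the names are illustrative assumptions only.
object FloatMinSketch {
  def floatMin(col: Seq[Option[Double]]): Option[Double] = {
    val allNansOrNulls = col.forall(v => v.isEmpty || v.exists(_.isNaN))
    val hasNan         = col.exists(_.exists(_.isNaN))
    val nanFree        = col.flatten.filterNot(_.isNaN) // GpuNansToNulls, with nulls dropped
    if (allNansOrNulls) {
      if (hasNan) Some(Double.NaN) else None // only Nans and/or nulls in the column
    } else {
      Some(nanFree.min) // safe: no Nans can reach the plain min
    }
  }

  def main(args: Array[String]): Unit = {
    println(floatMin(Seq(Some(2.0), Some(Double.NaN), None))) // Some(2.0), matching Spark's min
    println(floatMin(Seq(Some(Double.NaN), None)))            // Some(NaN)
    println(floatMin(Seq(None, None)))                        // None
  }
}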
If any value in this column is true, return `Nan`, @@ -1019,10 +1154,6 @@ abstract class GpuDecimalSum( Seq(updateSum, updateIsEmpty) } - override lazy val postUpdate: Seq[Expression] = { - Seq(GpuCheckOverflow(updateSum.attr, dt, !failOnErrorOverride), updateIsEmpty.attr) - } - // Used for Decimal overflow detection protected lazy val isEmpty: AttributeReference = AttributeReference("isEmpty", BooleanType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = { @@ -1045,10 +1176,7 @@ abstract class GpuDecimalSum( override lazy val postMerge: Seq[Expression] = { Seq( - GpuCheckOverflow(GpuIf(mergeIsOverflow.attr, - GpuLiteral.create(null, dt), - mergeSum.attr), - dt, !failOnErrorOverride), + GpuIf(mergeIsOverflow.attr, GpuLiteral.create(null, dt), mergeSum.attr), mergeIsEmpty.attr) } @@ -1140,8 +1268,9 @@ case class GpuDecimal128Sum( override lazy val updateAggregates: Seq[CudfAggregate] = updateSumChunks :+ updateIsEmpty override lazy val postUpdate: Seq[Expression] = { - val assembleExpr = GpuAssembleSumChunks(updateSumChunks.map(_.attr), dt, !failOnErrorOverride) - Seq(GpuCheckOverflow(assembleExpr, dt, !failOnErrorOverride), updateIsEmpty.attr) + Seq( + GpuAssembleSumChunks(updateSumChunks.map(_.attr), dt, !failOnErrorOverride), + updateIsEmpty.attr) } override lazy val preMerge: Seq[Expression] = { @@ -1165,10 +1294,7 @@ case class GpuDecimal128Sum( override lazy val postMerge: Seq[Expression] = { val assembleExpr = GpuAssembleSumChunks(mergeSumChunks.map(_.attr), dt, !failOnErrorOverride) Seq( - GpuCheckOverflow(GpuIf(mergeIsOverflow.attr, - GpuLiteral.create(null, dt), - assembleExpr), - dt, !failOnErrorOverride), + GpuIf(mergeIsOverflow.attr, GpuLiteral.create(null, dt), assembleExpr), mergeIsEmpty.attr) } @@ -1470,11 +1596,20 @@ abstract class GpuDecimalAverage(child: Expression, sumDataType: DecimalType) sumDataType, nullOnOverflow = true), mergeCount.attr) + // This is here to be bug for bug compatible with Spark. They round in the divide and then cast + // the result to the final value. This loses some data in many cases and we need to be able to + // match that. This bug appears to have been fixed in Spark 3.4.0. + lazy val intermediateSparkDivideType = GpuDecimalDivide.calcOrigSparkOutputType(sumDataType, + DecimalType.LongDecimal) + // NOTE: this sets `failOnErrorOverride=false` in `GpuDivide` to force it not to throw // divide-by-zero exceptions, even when ansi mode is enabled in Spark. // This is to conform with Spark's behavior in the Average aggregate function. - override lazy val evaluateExpression: Expression = - GpuDecimalDivide(sum, count, dataType, failOnError = false) + override lazy val evaluateExpression: Expression = { + GpuCast( + GpuDecimalDivide(sum, count, intermediateSparkDivideType, failOnError = false), + dataType) + } // Window // Replace average with SUM/COUNT. 
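// A small, self-contained illustration (plain Scala over java.math.BigDecimal, not part of the
// plugin) of why the intermediate divide type above matters: rounding the quotient once at the
// divide's scale and then again when casting to the average's result type can differ from
// rounding directly to the result scale. The scales 12 and 6 and the HALF_UP rounding are
// illustrative assumptions standing in for the types Spark actually derives.
object AverageDoubleRoundingSketch {
  import java.math.{BigDecimal => JBigDecimal, RoundingMode}

  def main(args: Array[String]): Unit = {
    val sum   = new JBigDecimal("0.8888889999990") // exact quotient below is 0.4444444999995
    val count = new JBigDecimal("2")
    // Round in the divide (scale 12), then cast down to the final scale (6).
    val divideThenCast = sum.divide(count, 12, RoundingMode.HALF_UP)
      .setScale(6, RoundingMode.HALF_UP)
    // Round the exact quotient directly to the final scale.
    val directRounding = sum.divide(count, 6, RoundingMode.HALF_UP)
    println(s"divide-then-cast = $divideThenCast, direct = $directRounding") // 0.444445 vs 0.444444
  }
}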
This lets us run average in running window mode without @@ -1482,7 +1617,9 @@ abstract class GpuDecimalAverage(child: Expression, sumDataType: DecimalType) override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = { val count = GpuWindowExpression(GpuCount(Seq(child)), spec) val sum = GpuWindowExpression(GpuSum(child, sumDataType, failOnErrorOverride = false), spec) - GpuDecimalDivide(sum, count, dataType, failOnError = false) + GpuCast( + GpuDecimalDivide(sum, count, intermediateSparkDivideType, failOnError = false), + dataType) } override val dataType: DecimalType = child.dataType match { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExternalSource.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExternalSource.scala index 43ddbf440e6..1367ee95c68 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExternalSource.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/ExternalSource.scala @@ -137,8 +137,7 @@ object ExternalSource extends Logging { if (hasSparkAvroJar && avroProvider.isSupportedScan(scan)) { avroProvider.copyScanWithInputFileTrue(scan) } else if (hasIcebergJar && icebergProvider.isSupportedScan(scan)) { - // Iceberg does not yet support a coalescing reader, so nothing to change - scan + icebergProvider.copyScanWithInputFileTrue(scan) } else { throw new RuntimeException(s"Unsupported scan type: ${scan.getClass.getSimpleName}") } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala index 87ceaab9bc5..f98a8274951 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuAvroScan.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids import java.io.{FileNotFoundException, IOException, OutputStream} import java.net.URI -import java.util.concurrent.Callable +import java.util.concurrent.{Callable, TimeUnit} import scala.annotation.tailrec import scala.collection.JavaConverters.{asScalaBufferConverter, mapAsScalaMapConverter} @@ -248,6 +248,7 @@ case class GpuAvroMultiFilePartitionReaderFactory( val clippedBlocks = ArrayBuffer[AvroSingleDataBlockInfo]() val mapPathHeader = LinkedHashMap[Path, Header]() val filterHandler = AvroFileFilterHandler(conf, options) + val currentTime = System.nanoTime() files.foreach { file => val singleFileInfo = try { filterHandler.filterBlocks(file) @@ -269,12 +270,16 @@ case class GpuAvroMultiFilePartitionReaderFactory( AvroDataBlock(block), file.partitionValues, AvroSchemaWrapper(SchemaConverters.toAvroType(readDataSchema)), + readDataSchema, AvroExtraInfo())) if (singleFileInfo.blocks.nonEmpty) { // No need to check the header since it can not be null when blocks is not empty here. 
mapPathHeader.put(fPath, singleFileInfo.header) } } + metrics.get("scanTime").foreach { + _ += TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - currentTime) + } new GpuMultiFileAvroPartitionReader(conf, files, clippedBlocks, readDataSchema, partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, numThreads, debugDumpPrefix, metrics, mapPathHeader.toMap) @@ -827,7 +832,7 @@ class GpuMultiFileAvroPartitionReader( override val debugDumpPrefix: Option[String], execMetrics: Map[String, GpuMetric], mapPathHeader: Map[Path, Header]) - extends MultiFileCoalescingPartitionReaderBase(conf, clippedBlocks, readDataSchema, + extends MultiFileCoalescingPartitionReaderBase(conf, clippedBlocks, partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, numThreads, execMetrics) with GpuAvroReaderBase { @@ -886,7 +891,7 @@ class GpuMultiFileAvroPartitionReader( } override def readBufferToTable(dataBuffer: HostMemoryBuffer, dataSize: Long, - clippedSchema: SchemaBase, extraInfo: ExtraInfo): Table = { + clippedSchema: SchemaBase, readSchema: StructType, extraInfo: ExtraInfo): Table = { sendToGpuUnchecked(dataBuffer, dataSize, splits) } @@ -1013,6 +1018,7 @@ case class AvroSingleDataBlockInfo( dataBlock: AvroDataBlock, partitionValues: InternalRow, schema: AvroSchemaWrapper, + readSchema: StructType, extraInfo: AvroExtraInfo) extends SingleDataBlockInfo case class AvroBatchContext( diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/arithmetic.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/arithmetic.scala index 0411aeafba6..e881e47fa48 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/arithmetic.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/arithmetic.scala @@ -394,7 +394,7 @@ case class GpuDecimalMultiply( left: Expression, right: Expression, dataType: DecimalType, - needsExtraOverflowChecks: Boolean = false, + useLongMultiply: Boolean = false, failOnError: Boolean = SQLConf.get.ansiEnabled) extends ShimExpression with GpuExpression { @@ -409,7 +409,7 @@ case class GpuDecimalMultiply( private[this] lazy val intermediateResultType = GpuDecimalMultiply.intermediateResultType(lhsType, rhsType, dataType) - override def columnarEval(batch: ColumnarBatch): Any = { + def regularMultiply(batch: ColumnarBatch): Any = { val castLhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(left, batch)) { lhs => GpuCast.doCast(lhs.getBase, lhs.dataType(), intermediateLhsType, ansiMode = failOnError, legacyCastToString = false, stringToDateAnsiModeEnabled = false) @@ -422,7 +422,7 @@ case class GpuDecimalMultiply( withResource(castRhs) { castRhs => withResource(castLhs.mul(castRhs, GpuColumnVector.getNonNestedRapidsType(intermediateResultType))) { mult => - if (needsExtraOverflowChecks) { + if (useLongMultiply) { withResource(GpuDecimalMultiply.checkForOverflow(castLhs, castRhs)) { wouldOverflow => if (failOnError) { withResource(wouldOverflow.any()) { anyOverflow => @@ -450,6 +450,43 @@ case class GpuDecimalMultiply( } } + def longMultiply(batch: ColumnarBatch): Any = { + val castLhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(left, batch)) { lhs => + lhs.getBase.castTo(DType.create(DType.DTypeEnum.DECIMAL128, lhs.getBase.getType.getScale)) + } + val retTab = withResource(castLhs) { castLhs => + val castRhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(right, batch)) { rhs => + rhs.getBase.castTo(DType.create(DType.DTypeEnum.DECIMAL128, rhs.getBase.getType.getScale)) + } + withResource(castRhs) { castRhs => + 
com.nvidia.spark.rapids.jni.DecimalUtils.multiply128(castLhs, castRhs, -dataType.scale) + } + } + val retCol = withResource(retTab) { retTab => + if (failOnError) { + withResource(retTab.getColumn(0).any()) { anyOverflow => + if (anyOverflow.isValid && anyOverflow.getBoolean) { + throw new ArithmeticException(GpuCast.INVALID_INPUT_MESSAGE) + } + } + retTab.getColumn(1).incRefCount() + } else { + withResource(GpuScalar.from(null, dataType)) { nullVal => + retTab.getColumn(0).ifElse(nullVal, retTab.getColumn(1)) + } + } + } + GpuColumnVector.from(retCol, dataType) + } + + override def columnarEval(batch: ColumnarBatch): Any = { + if (useLongMultiply) { + longMultiply(batch) + } else { + regularMultiply(batch) + } + } + override def nullable: Boolean = left.nullable || right.nullable override def children: Seq[Expression] = Seq(left, right) @@ -778,6 +815,9 @@ case class GpuDecimalDivide( failOnError: Boolean = SQLConf.get.ansiEnabled) extends ShimExpression with GpuExpression { + // For all decimal128 output we will use the long division version. + protected lazy val useLongDivision: Boolean = dataType.precision > Decimal.MAX_LONG_DIGITS + override def toString: String = s"($left / $right)" override def sql: String = s"(${left.sql} / ${right.sql})" @@ -815,7 +855,7 @@ case class GpuDecimalDivide( } } - override def columnarEval(batch: ColumnarBatch): Any = { + def regularDivide(batch: ColumnarBatch): Any = { val castLhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(left, batch)) { lhs => GpuCast.doCast(lhs.getBase, lhs.dataType(), intermediateLhsType, ansiMode = failOnError, legacyCastToString = false, stringToDateAnsiModeEnabled = false) @@ -842,12 +882,83 @@ case class GpuDecimalDivide( } } + def longDivide(batch: ColumnarBatch): Any = { + val castLhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(left, batch)) { lhs => + lhs.getBase.castTo(DType.create(DType.DTypeEnum.DECIMAL128, lhs.getBase.getType.getScale)) + } + val retTab = withResource(castLhs) { castLhs => + val castRhs = withResource(GpuExpressionsUtils.columnarEvalToColumn(right, batch)) { rhs => + withResource(divByZeroFixes(rhs.getBase)) { fixed => + fixed.castTo(DType.create(DType.DTypeEnum.DECIMAL128, fixed.getType.getScale)) + } + } + withResource(castRhs) { castRhs => + com.nvidia.spark.rapids.jni.DecimalUtils.divide128(castLhs, castRhs, -dataType.scale) + } + } + val retCol = withResource(retTab) { retTab => + val overflowed = retTab.getColumn(0) + val quotient = retTab.getColumn(1) + if (failOnError) { + withResource(overflowed.any()) { anyOverflow => + if (anyOverflow.isValid && anyOverflow.getBoolean) { + throw new ArithmeticException(GpuCast.INVALID_INPUT_MESSAGE) + } + } + quotient.incRefCount() + } else { + withResource(GpuScalar.from(null, dataType)) { nullVal => + overflowed.ifElse(nullVal, quotient) + } + } + } + GpuColumnVector.from(retCol, dataType) + } + + override def columnarEval(batch: ColumnarBatch): Any = { + if (useLongDivision) { + longDivide(batch) + } else { + regularDivide(batch) + } + } + override def nullable: Boolean = true override def children: Seq[Expression] = Seq(left, right) } object GpuDecimalDivide { + // This comes from DecimalType.MINIMUM_ADJUSTED_SCALE, but for some reason it is gone + // in databricks so we have it here. 
+ private val MINIMUM_ADJUSTED_SCALE = 6 + + def calcOrigSparkOutputType(lhs: DecimalType, rhs: DecimalType): DecimalType = { + // This comes almost directly from Spark unchanged + val allowPrecisionLoss = SQLConf.get.decimalOperationsAllowPrecisionLoss + val p1 = lhs.precision + val s1 = lhs.scale + val p2 = rhs.precision + val s2 = rhs.scale + if (allowPrecisionLoss) { + // Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1) + // Scale: max(6, s1 + p2 + 1) + val intDig = p1 - s1 + s2 + val scale = math.max(MINIMUM_ADJUSTED_SCALE, s1 + p2 + 1) + val prec = intDig + scale + DecimalType.adjustPrecisionScale(prec, scale) + } else { + var intDig = math.min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = math.min(DecimalType.MAX_SCALE, math.max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = DecimalType.MAX_SCALE - decDig + } + DecimalType.bounded(intDig + decDig, decDig) + } + } + // For Spark the final desired output is // new_scale = max(6, lhs.scale + rhs.precision + 1) // new_precision = lhs.precision - lhs.scale + rhs.scale + new_scale diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 0ec1a295b5c..f1ea25eb1fe 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -16,6 +16,7 @@ package org.apache.spark.sql.rapids +import java.time.ZoneId import java.util.concurrent.TimeUnit import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, Scalar} @@ -24,7 +25,7 @@ import com.nvidia.spark.rapids.GpuOverrides.{extractStringLit, getTimeParserPoli import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.shims.ShimBinaryExpression -import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ExpectsInputTypes, Expression, ImplicitCastInputTypes, NullIntolerant, TimeZoneAwareExpression} +import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ExpectsInputTypes, Expression, FromUTCTimestamp, ImplicitCastInputTypes, NullIntolerant, TimeZoneAwareExpression} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch @@ -877,6 +878,71 @@ case class GpuFromUnixTime( override lazy val resolved: Boolean = childrenResolved && checkInputDataTypes().isSuccess } + +class FromUTCTimestampExprMeta( + expr: FromUTCTimestamp, + override val conf: RapidsConf, + override val parent: Option[RapidsMeta[_, _, _]], + rule: DataFromReplacementRule) + extends BinaryExprMeta[FromUTCTimestamp](expr, conf, parent, rule) { + + override def tagExprForGpu(): Unit = { + extractStringLit(expr.right) match { + case None => + willNotWorkOnGpu("timezone input must be a literal string") + case Some(timezoneShortID) => + if (timezoneShortID != null) { + val utc = ZoneId.of("UTC").normalized + // This is copied from Spark, to convert `(+|-)h:mm` into `(+|-)0h:mm`. 
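// A self-contained check (plain Scala plus java.time, illustrative only) of what the
// replaceFirst below does: it pads a single-digit hour offset so that ZoneId.of accepts it,
// e.g. "+8:00" becomes "+08:00", while offsets that already have two hour digits are left
// unchanged. Only the regex is taken from the patch; the object and method names are assumed.
object OffsetPaddingSketch {
  def pad(tz: String): String = tz.replaceFirst("(\\+|\\-)(\\d):", "$10$2:")

  def main(args: Array[String]): Unit = {
    assert(pad("+8:00") == "+08:00")
    assert(pad("-9:30") == "-09:30")
    assert(pad("+10:00") == "+10:00") // already two hour digits: unchanged
    // The padded form is a valid zone id, mirroring the conversion described above.
    println(java.time.ZoneId.of(pad("+8:00"), java.time.ZoneId.SHORT_IDS))
  }
}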
+ val timezone = ZoneId.of(timezoneShortID.replaceFirst("(\\+|\\-)(\\d):", "$10$2:"), + ZoneId.SHORT_IDS).normalized + + if (timezone != utc) { + willNotWorkOnGpu("only timezones equivalent to UTC are supported") + } + } + } + } + + override def convertToGpu(timestamp: Expression, timezone: Expression): GpuExpression = + GpuFromUTCTimestamp(timestamp, timezone) +} + +case class GpuFromUTCTimestamp(timestamp: Expression, timezone: Expression) + extends GpuBinaryExpression with ImplicitCastInputTypes with NullIntolerant { + + override def left: Expression = timestamp + override def right: Expression = timezone + override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) + override def dataType: DataType = TimestampType + + override def doColumnar(lhs: GpuColumnVector, rhs: GpuColumnVector): ColumnVector = { + throw new IllegalStateException( + "Cannot have time zone given by a column vector in GpuFromUTCTimestamp") + } + + override def doColumnar(lhs: GpuScalar, rhs: GpuColumnVector): ColumnVector = { + throw new IllegalStateException( + "Cannot have time zone given by a column vector in GpuFromUTCTimestamp") + } + + override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { + if (rhs.getBase.isValid) { + // Just a no-op. + lhs.getBase.incRefCount() + } else { + // All-null output column. + GpuColumnVector.columnVectorFromNull(lhs.getRowCount.toInt, dataType) + } + } + + override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = { + withResource(GpuColumnVector.from(lhs, numRows, left.dataType)) { lhsCol => + doColumnar(lhsCol, rhs) + } + } +} + trait GpuDateMathBase extends GpuBinaryExpression with ExpectsInputTypes { override def inputTypes: Seq[AbstractDataType] = Seq(DateType, TypeCollection(IntegerType, ShortType, ByteType)) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CollectionOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CollectionOpSuite.scala index 6d1bd7fc279..7497bc42d7a 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/CollectionOpSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CollectionOpSuite.scala @@ -19,22 +19,18 @@ package com.nvidia.spark.rapids import org.apache.spark.sql.functions.map_concat class CollectionOpSuite extends SparkQueryCompareTestSuite { - testGpuFallback( - "MapConcat with Array keys fall back", - "ProjectExec", - ArrayKeyMapDF, - execsAllowedNonGpu = Seq("ProjectExec", "ShuffleExchangeExec")) { + testSparkResultsAreEqual( + "MapConcat with Array keys", + ArrayKeyMapDF) { frame => { import frame.sparkSession.implicits._ frame.select(map_concat($"col1", $"col2")) } } - testGpuFallback( - "MapConcat with Struct keys fall back", - "ProjectExec", - StructKeyMapDF, - execsAllowedNonGpu = Seq("ProjectExec", "ShuffleExchangeExec")) { + testSparkResultsAreEqual( + "MapConcat with Struct keys", + StructKeyMapDF) { frame => { import frame.sparkSession.implicits._ frame.select(map_concat($"col1", $"col2")) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala index f277921c7d3..2b29555852e 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala @@ -206,7 +206,7 @@ class GpuShuffledHashJoinExecSuite extends FunSuite with Arm with MockitoSugar { test("test a SerializedTableColumn") { 
TestUtils.withGpuSparkSession(new SparkConf()) { _ => - closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => + withResource(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => val cv = GpuColumnVector.from(cudfCol, IntegerType) val batch = new ColumnarBatch(Seq(cv).toArray, 5) withResource(GpuColumnVector.from(batch)) { tbl => @@ -242,7 +242,7 @@ class GpuShuffledHashJoinExecSuite extends FunSuite with Arm with MockitoSugar { test("test two batches, going over the limit") { TestUtils.withGpuSparkSession(new SparkConf()) { _ => - closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => + withResource(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => val cv = GpuColumnVector.from(cudfCol, IntegerType) val batch = new ColumnarBatch(Seq(cv).toArray, 5) withResource(GpuColumnVector.from(batch)) { tbl => @@ -281,7 +281,7 @@ class GpuShuffledHashJoinExecSuite extends FunSuite with Arm with MockitoSugar { test("test two batches, stating within the limit") { TestUtils.withGpuSparkSession(new SparkConf()) { _ => - closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => + withResource(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol => val cv = GpuColumnVector.from(cudfCol, IntegerType) val batch = new ColumnarBatch(Seq(cv).toArray, 5) withResource(GpuColumnVector.from(batch)) { tbl => diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala index ede7fba002a..9e679e61543 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala @@ -1618,21 +1618,6 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite { frame => frame.groupBy("large_longs").agg(avg("large_longs")) } { (_, gpuPlan) => checkExecPlan(gpuPlan) } - ALLOW_NON_GPU_testSparkResultsAreEqualWithCapture( - "min_with_nans_fall_back", - nanDf, - Seq("HashAggregateExec", "AggregateExpression", - "AttributeReference", "Alias", "Min", "ShuffleExchangeExec"), - conf = enableCsvConf()) { - frame => frame.agg(min("doubles")) - } { (_, gpuPlan) => { - // verify nothing ran on the gpu - if (gpuPlan.conf.getAllConfs(RapidsConf.SQL_ENABLED.key).toBoolean) { - val execNode = gpuPlan.find(_.isInstanceOf[GpuHashAggregateExec]) - assert(execNode.isEmpty) - } - }} - IGNORE_ORDER_testSparkResultsAreEqual( testName = "Test NormalizeNansAndZeros(Float)", floatWithDifferentKindsOfNansAndZeros, diff --git a/tools/src/main/resources/operatorsScore.csv b/tools/src/main/resources/operatorsScore.csv index aeaf6fd9fb4..ce62bb87fc5 100644 --- a/tools/src/main/resources/operatorsScore.csv +++ b/tools/src/main/resources/operatorsScore.csv @@ -103,6 +103,7 @@ Explode,4 Expm1,4 First,4 Floor,4 +FromUTCTimestamp,4 FromUnixTime,4 GetArrayItem,4 GetArrayStructFields,4 diff --git a/tools/src/main/resources/supportedExprs.csv b/tools/src/main/resources/supportedExprs.csv index 673b2b86f9c..7b93812944c 100644 --- a/tools/src/main/resources/supportedExprs.csv +++ b/tools/src/main/resources/supportedExprs.csv @@ -169,7 +169,7 @@ DenseRank,S,`dense_rank`,None,window,ordering,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS, DenseRank,S,`dense_rank`,None,window,result,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Divide,S,`/`,None,project,lhs,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA Divide,S,`/`,None,project,rhs,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA 
-Divide,S,`/`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA +Divide,S,`/`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA ElementAt,S,`element_at`,None,project,array/map,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,PS,NA,NA ElementAt,S,`element_at`,None,project,index/key,PS,PS,PS,S,PS,PS,PS,PS,PS,PS,PS,NS,NS,NS,NS,NS,NS,NS ElementAt,S,`element_at`,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS @@ -197,6 +197,9 @@ Expm1,S,`expm1`,None,AST,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Expm1,S,`expm1`,None,AST,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Floor,S,`floor`,None,project,input,NA,NA,NA,NA,S,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA Floor,S,`floor`,None,project,result,NA,NA,NA,NA,S,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +FromUTCTimestamp,S,`from_utc_timestamp`,None,project,timestamp,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA +FromUTCTimestamp,S,`from_utc_timestamp`,None,project,timezone,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA +FromUTCTimestamp,S,`from_utc_timestamp`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA FromUnixTime,S,`from_unixtime`,None,project,sec,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA FromUnixTime,S,`from_unixtime`,None,project,format,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA FromUnixTime,S,`from_unixtime`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA @@ -360,7 +363,7 @@ Or,S,`or`,None,AST,rhs,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Or,S,`or`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA PercentRank,S,`percent_rank`,None,window,ordering,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,NS,NS PercentRank,S,`percent_rank`,None,window,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -Pmod,S,`pmod`,None,project,lhs,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +Pmod,S,`pmod`,None,project,lhs,NA,S,S,S,S,S,S,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA Pmod,S,`pmod`,None,project,rhs,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA Pmod,S,`pmod`,None,project,result,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA PosExplode,S,`posexplode_outer`; `posexplode`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,PS,NA,NA @@ -625,11 +628,11 @@ Max,S,`max`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS Max,S,`max`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS Max,S,`max`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS Max,S,`max`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS -Min,S,`min`,None,aggregation,input,S,S,S,S,S,PS,PS,S,PS,S,S,S,NS,NS,NS,NA,PS,NS +Min,S,`min`,None,aggregation,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS Min,S,`min`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS -Min,S,`min`,None,reduction,input,S,S,S,S,S,PS,PS,S,PS,S,S,S,NS,NS,NS,NA,PS,NS +Min,S,`min`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS Min,S,`min`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS -Min,S,`min`,None,window,input,S,S,S,S,S,PS,PS,S,PS,S,S,S,NS,NS,NS,NA,NS,NS +Min,S,`min`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS Min,S,`min`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS PivotFirst,S, ,None,aggregation,pivotColumn,S,S,S,S,S,PS,PS,S,PS,S,S,S,NS,NS,NS,NS,NS,NS PivotFirst,S, ,None,aggregation,valueColumn,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,NS,NS
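// A plain-Scala transcription (a sketch for checking the arithmetic, not plugin code) of the
// precision/scale formula in GpuDecimalDivide.calcOrigSparkOutputType shown earlier in this
// patch, restricted to the allowPrecisionLoss=true path. MAX_PRECISION = 38 and
// MINIMUM_ADJUSTED_SCALE = 6 follow Spark's DecimalType constants; the adjust step assumes a
// non-negative scale. The worked example and all names here are illustrative assumptions.
object DivideTypeSketch {
  private val MaxPrecision = 38
  private val MinAdjustedScale = 6

  // Cap precision at 38, preserving integral digits and keeping at least min(scale, 6).
  private def adjust(precision: Int, scale: Int): (Int, Int) =
    if (precision <= MaxPrecision) (precision, scale)
    else {
      val intDigits = precision - scale
      (MaxPrecision, math.max(MaxPrecision - intDigits, math.min(scale, MinAdjustedScale)))
    }

  // (precision, scale) of DECIMAL(p1, s1) / DECIMAL(p2, s2) with precision loss allowed.
  def divideType(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) = {
    val intDig = p1 - s1 + s2
    val scale = math.max(MinAdjustedScale, s1 + p2 + 1)
    adjust(intDig + scale, scale)
  }

  def main(args: Array[String]): Unit = {
    // For example, dividing a DECIMAL(28, 2) sum by a DECIMAL(20, 0) count:
    // intDig = 26, desired scale = 23, desired precision = 49 -> capped to DECIMAL(38, 12).
    println(divideType(p1 = 28, s1 = 2, p2 = 20, s2 = 0)) // (38,12)
  }
}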