Commit

Enable build for Databricks 13.3 [databricks] (#9677)
* pom changes

* pom changes

* pom changes

* add databricks13.3 to premerge

* Added ToPrettyString support

* xfail approximate percentile test

* xfail failing udf tests

* xfail failing tests due to WriteIntoDeltaCommand

* xfail test_delta_atomic_create_table_as_select and test_delta_atomic_replace_table_as_select

* Added 341db to shim-deps and removed from datagen/pom.xml

* updated udf-compiler pom.xml

* updated sql-plugin pom.xml

* fixed multiple pom.xml

* updated udf-compiler pom.xml

* removed TODO

* Signoff

Signed-off-by: Raza Jafri <[email protected]>

* updated scala 2.13 poms

* Revert "xfail failing tests due to WriteIntoDeltaCommand"

This reverts commit 00b498e.

* Revert "xfail test_delta_atomic_create_table_as_select and test_delta_atomic_replace_table_as_select"

This reverts commit ea2fd40.

* remove tests/pom.xml changes

* reverted 2.13 generation of tests/pom.xml

* removed 341db profile from tests as we don't run unit tests on databricks

* fixed the xfail reason to point to the correct issue

* removed diff.patch

* Revert "xfail approximate percentile test"

This reverts commit 0a7fa52.

* build fixes

Signed-off-by: Jason Lowe <[email protected]>

* Fix spark321db build

* Skip UDF tests until UDF handling is updated

* Remove xfail/skips eclipsed by module-level skip

* xfail fastparquet tests due to nulls being introduced by pandas

* Fix incorrect shimplify directives for 341db

* Fix fallback test

---------

Signed-off-by: Raza Jafri <[email protected]>
Signed-off-by: Jason Lowe <[email protected]>
Co-authored-by: Jason Lowe <[email protected]>
razajafri and jlowe authored Nov 23, 2023
1 parent 61cfb7d commit d3629fd
Showing 29 changed files with 251 additions and 39 deletions.
17 changes: 17 additions & 0 deletions aggregator/pom.xml
@@ -619,6 +619,23 @@
</dependency>
</dependencies>
</profile>
+<profile>
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<dependencies>
+<dependency>
+<groupId>com.nvidia</groupId>
+<artifactId>rapids-4-spark-delta-spark341db_${scala.binary.version}</artifactId>
+<version>${project.version}</version>
+<classifier>${spark.version.classifier}</classifier>
+</dependency>
+</dependencies>
+</profile>
<profile>
<id>release333</id>
<activation>
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/delta_lake_merge_test.py
@@ -97,7 +97,7 @@ def checker(data_path, do_merge):
merge_sql=merge_sql,
check_func=checker)

-@allow_non_gpu("ExecutedCommandExec,BroadcastHashJoinExec,ColumnarToRowExec,BroadcastExchangeExec,DataWritingCommandExec", *delta_meta_allow)
+@allow_non_gpu("ExecutedCommandExec,BroadcastHashJoinExec,ColumnarToRowExec,BroadcastExchangeExec,DataWritingCommandExec", delta_write_fallback_allow, *delta_meta_allow)
@delta_lake
@ignore_order
@pytest.mark.skipif(is_databricks_runtime() and spark_version() < "3.3.2", reason="NOT MATCHED BY SOURCE added in DBR 12.2")
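The fallback-test fix above widens the test's allow list with delta_write_fallback_allow. A rough sketch of how such a marker can flatten comma-separated strings and shared constants into one allow list; every name and value below is a hypothetical stand-in, not the suite's actual definition:

import pytest

# Hedged sketch, not the project's implementation: in the real suite,
# allow_non_gpu and both constants come from the integration tests' shared
# modules. The values here are illustrative assumptions only.
delta_write_fallback_allow = "ExecutedCommandExec"   # hypothetical value
delta_meta_allow = ["DeserializeToObjectExec"]       # hypothetical value

def allow_non_gpu(*operators):
    # Flatten comma-separated strings into one allow list and attach it to
    # the test as a custom pytest mark for the test harness to inspect.
    flat = []
    for op in operators:
        flat.extend(op.split(","))
    return pytest.mark.allow_non_gpu(*flat)

@allow_non_gpu("BroadcastHashJoinExec,ColumnarToRowExec", delta_write_fallback_allow, *delta_meta_allow)
def test_merge_with_fallback(): ...
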
@@ -17,7 +17,7 @@
from asserts import assert_gpu_and_cpu_are_equal_collect
from data_gen import *
from fastparquet_utils import get_fastparquet_result_canonicalizer
-from spark_session import spark_version, with_cpu_session, with_gpu_session
+from spark_session import is_databricks_runtime, spark_version, with_cpu_session, with_gpu_session


def fastparquet_unavailable():
@@ -107,8 +107,12 @@ def read_with_fastparquet_or_plugin(spark):
pytest.param(IntegerGen(nullable=True),
marks=pytest.mark.xfail(reason="Nullables cause merge errors, when converting to Spark dataframe")),
LongGen(nullable=False),
-FloatGen(nullable=False),
-DoubleGen(nullable=False),
+pytest.param(FloatGen(nullable=False),
+marks=pytest.mark.xfail(is_databricks_runtime(),
+reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
+pytest.param(DoubleGen(nullable=False),
+marks=pytest.mark.xfail(is_databricks_runtime(),
+reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
StringGen(nullable=False),
pytest.param(DecimalGen(nullable=False),
marks=pytest.mark.xfail(reason="fastparquet reads Decimal columns as Float, as per "
@@ -131,8 +135,11 @@ def read_with_fastparquet_or_plugin(spark):
marks=pytest.mark.xfail(reason="Conversion from Pandas dataframe (read with fastparquet) to Spark dataframe "
"fails: \"Unable to infer the type of the field a\".")),
-StructGen(children=[("first", IntegerGen(nullable=False)),
-("second", FloatGen(nullable=False))], nullable=False)
+pytest.param(
+StructGen(children=[("first", IntegerGen(nullable=False)),
+("second", FloatGen(nullable=False))], nullable=False),
+marks=pytest.mark.xfail(is_databricks_runtime(),
+reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
], ids=idfn)
def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
"""
@@ -176,8 +183,12 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
LongGen(nullable=False),
pytest.param(LongGen(nullable=True),
marks=pytest.mark.xfail(reason="Nullables cause merge errors, when converting to Spark dataframe")),
-FloatGen(nullable=False),
-DoubleGen(nullable=False),
+pytest.param(FloatGen(nullable=False),
+marks=pytest.mark.xfail(is_databricks_runtime(),
+reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
+pytest.param(DoubleGen(nullable=False),
+marks=pytest.mark.xfail(is_databricks_runtime(),
+reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
StringGen(nullable=False),
pytest.param(DecimalGen(nullable=False),
marks=pytest.mark.xfail(reason="fastparquet reads Decimal columns as Float, as per "
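The fastparquet changes rely on pytest's conditional xfail: pytest.mark.xfail accepts a boolean condition as its first argument, so the parameter still runs everywhere but is only expected to fail where the condition holds (here, on Databricks). A minimal self-contained sketch of the pattern, with a placeholder predicate and reason:

import math
import pytest

on_databricks = False  # stand-in for spark_session.is_databricks_runtime()

@pytest.mark.parametrize("value", [
    1.5,
    # The xfail mark applies only when its condition argument is True; the
    # parameter runs as a normal test case otherwise.
    pytest.param(math.nan,
                 marks=pytest.mark.xfail(on_databricks,
                                         reason="placeholder for a tracked issue URL")),
])
def test_value_roundtrips(value):
    assert value == value or math.isnan(value)
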
7 changes: 6 additions & 1 deletion integration_tests/src/main/python/udf_cudf_test.py
@@ -37,10 +37,15 @@
from typing import Iterator
from pyspark.sql import Window
from pyspark.sql.functions import pandas_udf, PandasUDFType
-from spark_session import with_cpu_session, with_gpu_session
+from spark_session import is_databricks_runtime, is_spark_340_or_later, with_cpu_session, with_gpu_session
from marks import cudf_udf


+if is_databricks_runtime() and is_spark_340_or_later():
+    # Databricks 13.3 does not use separate reader/writer threads for Python UDFs
+    # which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
+    pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")

_conf = {
'spark.rapids.sql.exec.AggregateInPandasExec': 'true',
'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true',
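The module-wide skip above (used here and again in udf_test.py below) relies on pytest's pytestmark hook: assigning a mark, or a list of marks, to that module-level name applies it to every test collected from the file, which is what makes the per-test skips and xfails removed elsewhere in this commit redundant. A minimal sketch with the condition inlined as a placeholder:

import pytest

_on_db_13_3 = False  # stand-in for is_databricks_runtime() and is_spark_340_or_later()

if _on_db_13_3:
    # pytest reads the module-level pytestmark attribute at collection time
    # and applies the mark to every test defined in this module.
    pytestmark = pytest.mark.skip(reason="placeholder for the tracking issue URL")

def test_example():
    assert True
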
8 changes: 7 additions & 1 deletion integration_tests/src/main/python/udf_test.py
@@ -15,7 +15,7 @@
import pytest

from conftest import is_at_least_precommit_run
-from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_350_or_later
+from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later

from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version

@@ -43,6 +43,12 @@
import pyarrow
from typing import Iterator, Tuple


+if is_databricks_runtime() and is_spark_340_or_later():
+    # Databricks 13.3 does not use separate reader/writer threads for Python UDFs
+    # which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
+    pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")

arrow_udf_conf = {
'spark.sql.execution.arrow.pyspark.enabled': 'true',
'spark.rapids.sql.exec.WindowInPandasExec': 'true',
2 changes: 1 addition & 1 deletion jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -88,7 +88,7 @@ pipeline {
// 'name' and 'value' only support literal string in the declarative Jenkins
// Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127
name 'DB_RUNTIME'
-values '10.4', '11.3', '12.2'
+values '10.4', '11.3', '12.2', '13.3'
}
}
stages {
29 changes: 28 additions & 1 deletion pom.xml
@@ -509,6 +509,31 @@
<module>delta-lake/delta-spark332db</module>
</modules>
</profile>
+<profile>
+<!-- Note Databricks requires 2 properties -Ddatabricks and -Dbuildver=341db -->
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<properties>
+<!-- Downgrade scala plugin version due to: https://github.com/sbt/sbt/issues/4305 -->
+<scala.plugin.version>3.4.4</scala.plugin.version>
+<spark.version.classifier>spark341db</spark.version.classifier>
+<spark.version>${spark341db.version}</spark.version>
+<spark.test.version>${spark341db.version}</spark.test.version>
+<hadoop.client.version>3.3.1</hadoop.client.version>
+<rat.consoleOutput>true</rat.consoleOutput>
+<parquet.hadoop.version>1.12.0</parquet.hadoop.version>
+<iceberg.version>${spark330.iceberg.version}</iceberg.version>
+</properties>
+<modules>
+<module>shim-deps/databricks</module>
+<module>delta-lake/delta-spark341db</module>
+</modules>
+</profile>
<profile>
<id>release350</id>
<activation>
@@ -691,6 +716,7 @@
<spark332cdh.version>3.3.2.3.3.7190.0-91</spark332cdh.version>
<spark330db.version>3.3.0-databricks</spark330db.version>
<spark332db.version>3.3.2-databricks</spark332db.version>
+<spark341db.version>3.4.1-databricks</spark341db.version>
<spark350.version>3.5.0</spark350.version>
<mockito.version>3.12.4</mockito.version>
<scala.plugin.version>4.3.0</scala.plugin.version>
@@ -745,7 +771,8 @@
<databricks.buildvers>
321db,
330db,
-332db
+332db,
+341db
</databricks.buildvers>
<!--
Build and run unit tests on one specific version for each sub-version (e.g. 311, 320, 330)
17 changes: 17 additions & 0 deletions scala2.13/aggregator/pom.xml
@@ -619,6 +619,23 @@
</dependency>
</dependencies>
</profile>
+<profile>
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<dependencies>
+<dependency>
+<groupId>com.nvidia</groupId>
+<artifactId>rapids-4-spark-delta-spark341db_${scala.binary.version}</artifactId>
+<version>${project.version}</version>
+<classifier>${spark.version.classifier}</classifier>
+</dependency>
+</dependencies>
+</profile>
<profile>
<id>release333</id>
<activation>
29 changes: 28 additions & 1 deletion scala2.13/pom.xml
@@ -509,6 +509,31 @@
<module>delta-lake/delta-spark332db</module>
</modules>
</profile>
+<profile>
+<!-- Note Databricks requires 2 properties -Ddatabricks and -Dbuildver=341db -->
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<properties>
+<!-- Downgrade scala plugin version due to: https://github.com/sbt/sbt/issues/4305 -->
+<scala.plugin.version>3.4.4</scala.plugin.version>
+<spark.version.classifier>spark341db</spark.version.classifier>
+<spark.version>${spark341db.version}</spark.version>
+<spark.test.version>${spark341db.version}</spark.test.version>
+<hadoop.client.version>3.3.1</hadoop.client.version>
+<rat.consoleOutput>true</rat.consoleOutput>
+<parquet.hadoop.version>1.12.0</parquet.hadoop.version>
+<iceberg.version>${spark330.iceberg.version}</iceberg.version>
+</properties>
+<modules>
+<module>shim-deps/databricks</module>
+<module>delta-lake/delta-spark341db</module>
+</modules>
+</profile>
<profile>
<id>release350</id>
<activation>
@@ -691,6 +716,7 @@
<spark332cdh.version>3.3.2.3.3.7190.0-91</spark332cdh.version>
<spark330db.version>3.3.0-databricks</spark330db.version>
<spark332db.version>3.3.2-databricks</spark332db.version>
+<spark341db.version>3.4.1-databricks</spark341db.version>
<spark350.version>3.5.0</spark350.version>
<mockito.version>3.12.4</mockito.version>
<scala.plugin.version>4.3.0</scala.plugin.version>
@@ -745,7 +771,8 @@
<databricks.buildvers>
321db,
330db,
-332db
+332db,
+341db
</databricks.buildvers>
<!--
Build and run unit tests on one specific version for each sub-version (e.g. 311, 320, 330)
41 changes: 41 additions & 0 deletions scala2.13/shim-deps/pom.xml
@@ -118,6 +118,47 @@
</dependency>
</dependencies>
</profile>
+<profile>
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<dependencies>
+<dependency>
+<groupId>org.apache.logging.log4j</groupId>
+<artifactId>log4j-core</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.parquet</groupId>
+<artifactId>parquet-format-internal_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-common-utils_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-sql-api_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>shaded.parquet.org.apache.thrift</groupId>
+<artifactId>shaded-parquet-thrift_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+</dependencies>
+</profile>
<profile>
<id>dbdeps</id>
<activation>
41 changes: 41 additions & 0 deletions shim-deps/pom.xml
@@ -118,6 +118,47 @@
</dependency>
</dependencies>
</profile>
+<profile>
+<id>release341db</id>
+<activation>
+<property>
+<name>buildver</name>
+<value>341db</value>
+</property>
+</activation>
+<dependencies>
+<dependency>
+<groupId>org.apache.logging.log4j</groupId>
+<artifactId>log4j-core</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.parquet</groupId>
+<artifactId>parquet-format-internal_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-common-utils_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>org.apache.spark</groupId>
+<artifactId>spark-sql-api_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+<dependency>
+<groupId>shaded.parquet.org.apache.thrift</groupId>
+<artifactId>shaded-parquet-thrift_${scala.binary.version}</artifactId>
+<version>${spark.version}</version>
+<scope>provided</scope>
+</dependency>
+</dependencies>
+</profile>
<profile>
<id>dbdeps</id>
<activation>
@@ -30,7 +30,6 @@
{"spark": "331"}
{"spark": "332cdh"}
{"spark": "332db"}
-{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

@@ -33,7 +33,6 @@
{"spark": "332cdh"}
{"spark": "332db"}
{"spark": "333"}
-{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

@@ -36,7 +36,6 @@
{"spark": "333"}
{"spark": "340"}
{"spark": "341"}
-{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.hive.rapids.shims

@@ -36,7 +36,6 @@
{"spark": "333"}
{"spark": "340"}
{"spark": "341"}
-{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.hive.rapids.shims

@@ -30,7 +30,6 @@
{"spark": "332cdh"}
{"spark": "332db"}
{"spark": "333"}
-{"spark": "341db"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims

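Each of the five source diffs above removes {"spark": "341db"} from a spark-rapids-shim-json-lines comment block. Those JSON lines are shimplify build directives: every entry names a buildver whose shim compiles the file, so dropping the 341db entry detaches the file from the 341db build, which supplies its own implementation instead. A rough sketch of reading such a block, assuming simplified marker handling rather than the project's actual shimplify tooling:

import json

OPEN_MARK = "spark-rapids-shim-json-lines"
CLOSE_MARK = "spark-rapids-shim-json-lines ***/"

def shims_for(source_text):
    # Collect the buildvers named in a file's shim directive block.
    shims, in_block = [], False
    for line in source_text.splitlines():
        stripped = line.strip()
        if stripped.endswith(CLOSE_MARK):
            break                 # closing marker: spark-rapids-shim-json-lines ***/
        if in_block:
            shims.append(json.loads(stripped)["spark"])
        elif stripped.endswith(OPEN_MARK):
            in_block = True       # opening marker: /*** spark-rapids-shim-json-lines
    return shims

Applied to any of the files above after this commit, the returned list would still contain entries such as "332db" but no longer "341db".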
