Add invariant enforcement support (#834)
# Description

Adds support for retrieving invariants from the Delta schema, plus a struct `DeltaDataChecker` that uses DataFusion to check them and report useful errors. This is also hooked up to the Python bindings, allowing `write_deltalake()` to support Writer Protocol V2.

I looked briefly at the Rust writer, but then realized we don't want to introduce a dependency on DataFusion there. We should discuss how we want to design that API. I suspect we'll turn `DeltaDataChecker` into a trait, so we can have a DataFusion implementation available while still allowing other engines to implement it themselves if they don't wish to use DataFusion.

# Related Issue(s)

- closes #592
- closes #575

# Documentation

https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-invariants
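For illustration, a minimal sketch of what this enables from Python, based on the tests in this commit (the function name `demo` and the table URI are hypothetical; the error wording is taken from the expected match in `test_write_invariant` below):

```python
# Hedged sketch: write_deltalake() enforcing a column invariant (Writer Protocol V2).
# Assumes an existing Delta table at `table_uri` whose schema declares the
# invariant c1 > 3; see test_write_invariant below for how Spark creates one.
import pyarrow as pa

from deltalake import write_deltalake
from deltalake._internal import PyDeltaTableError


def demo(table_uri: str) -> None:
    bad = pa.table({"c1": pa.array([6, 2], type=pa.int32())})
    try:
        write_deltalake(table_uri, bad, mode="overwrite")
    except PyDeltaTableError as err:
        # Expected to report something like: "Invariant (c1 > 3) violated by value 2"
        print(f"Write rejected: {err}")
```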
Showing 12 changed files with 554 additions and 46 deletions.
python/tests/pyspark_integration/test_write_to_pyspark.py (115 additions, 0 deletions)
"""Tests that deltalake(delta-rs) can write to tables written by PySpark""" | ||
import pathlib | ||
|
||
import pyarrow as pa | ||
import pytest | ||
|
||
from deltalake import write_deltalake | ||
from deltalake._internal import PyDeltaTableError | ||
from deltalake.writer import DeltaTableProtocolError | ||
|
||
from .utils import assert_spark_read_equal, get_spark | ||
|
||
try: | ||
import delta | ||
import delta.pip_utils | ||
import delta.tables | ||
import pyspark | ||
|
||
spark = get_spark() | ||
except ModuleNotFoundError: | ||
pass | ||
|
||
|
||
@pytest.mark.pyspark | ||
@pytest.mark.integration | ||
def test_write_basic(tmp_path: pathlib.Path): | ||
# Write table in Spark | ||
spark = get_spark() | ||
schema = pyspark.sql.types.StructType( | ||
[ | ||
pyspark.sql.types.StructField( | ||
"c1", | ||
dataType=pyspark.sql.types.IntegerType(), | ||
nullable=True, | ||
) | ||
] | ||
) | ||
spark.createDataFrame([(4,)], schema=schema).write.save( | ||
str(tmp_path), | ||
mode="append", | ||
format="delta", | ||
) | ||
# Overwrite table in deltalake | ||
data = pa.table({"c1": pa.array([5, 6], type=pa.int32())}) | ||
write_deltalake(str(tmp_path), data, mode="overwrite") | ||
|
||
# Read table in Spark | ||
assert_spark_read_equal(data, str(tmp_path), sort_by="c1") | ||
|
||
|
||
@pytest.mark.pyspark | ||
@pytest.mark.integration | ||
def test_write_invariant(tmp_path: pathlib.Path): | ||
# Write table in Spark with invariant | ||
spark = get_spark() | ||
|
||
schema = pyspark.sql.types.StructType( | ||
[ | ||
pyspark.sql.types.StructField( | ||
"c1", | ||
dataType=pyspark.sql.types.IntegerType(), | ||
nullable=True, | ||
metadata={ | ||
"delta.invariants": '{"expression": { "expression": "c1 > 3"} }' | ||
}, | ||
) | ||
] | ||
) | ||
|
||
delta.tables.DeltaTable.create(spark).location(str(tmp_path)).addColumns( | ||
schema | ||
).execute() | ||
|
||
spark.createDataFrame([(4,)], schema=schema).write.save( | ||
str(tmp_path), | ||
mode="append", | ||
format="delta", | ||
) | ||
|
||
# Cannot write invalid data to the table | ||
invalid_data = pa.table({"c1": pa.array([6, 2], type=pa.int32())}) | ||
with pytest.raises( | ||
PyDeltaTableError, match="Invariant \(c1 > 3\) violated by value .+2" | ||
): | ||
# raise PyDeltaTableError("test") | ||
write_deltalake(str(tmp_path), invalid_data, mode="overwrite") | ||
|
||
# Can write valid data to the table | ||
valid_data = pa.table({"c1": pa.array([5, 6], type=pa.int32())}) | ||
write_deltalake(str(tmp_path), valid_data, mode="append") | ||
|
||
expected = pa.table({"c1": pa.array([4, 5, 6], type=pa.int32())}) | ||
assert_spark_read_equal(expected, str(tmp_path), sort_by="c1") | ||
|
||
|
||
@pytest.mark.pyspark | ||
@pytest.mark.integration | ||
def test_checks_min_writer_version(tmp_path: pathlib.Path): | ||
# Write table in Spark with constraint | ||
spark = get_spark() | ||
|
||
spark.createDataFrame([(4,)], schema=["c1"]).write.save( | ||
str(tmp_path), | ||
mode="append", | ||
format="delta", | ||
) | ||
|
||
# Add a constraint upgrades the minWriterProtocol | ||
spark.sql(f"ALTER TABLE delta.`{str(tmp_path)}` ADD CONSTRAINT x CHECK (c1 > 2)") | ||
|
||
with pytest.raises( | ||
DeltaTableProtocolError, match="This table's min_writer_version is 3, but" | ||
): | ||
valid_data = pa.table({"c1": pa.array([5, 6])}) | ||
write_deltalake(str(tmp_path), valid_data, mode="append") |
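Per the column-invariants section of the Delta protocol linked in the description, an invariant is stored in a schema field's metadata under the `delta.invariants` key as a JSON document whose `expression.expression` member holds the SQL expression. A minimal sketch of reading it back out (the helper name `get_invariant` is hypothetical, not part of this commit):

```python
import json
from typing import Optional


def get_invariant(field_metadata: dict) -> Optional[str]:
    """Return the SQL invariant expression stored on a schema field, if any.

    Example input, matching the metadata used in test_write_invariant above:
        {"delta.invariants": '{"expression": { "expression": "c1 > 3"} }'}
    """
    raw = field_metadata.get("delta.invariants")
    if raw is None:
        return None
    # The protocol wraps the SQL text in a nested "expression" object.
    return json.loads(raw)["expression"]["expression"]


assert get_invariant(
    {"delta.invariants": '{"expression": { "expression": "c1 > 3"} }'}
) == "c1 > 3"
```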
python/tests/pyspark_integration/utils.py (49 additions, 0 deletions)
```python
from typing import List

import pyarrow as pa

try:
    import delta
    import delta.pip_utils
    import delta.tables
    import pyspark
except ModuleNotFoundError:
    pass

try:
    from pandas.testing import assert_frame_equal
except ModuleNotFoundError:
    _has_pandas = False
else:
    _has_pandas = True


def get_spark():
    builder = (
        pyspark.sql.SparkSession.builder.appName("MyApp")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
    )
    return delta.pip_utils.configure_spark_with_delta_pip(builder).getOrCreate()


def assert_spark_read_equal(
    expected: pa.Table, uri: str, sort_by: List[str] = ["int32"]
):
    spark = get_spark()
    df = spark.read.format("delta").load(uri)

    # Spark and pyarrow don't convert these types to the same Pandas values
    incompatible_types = ["timestamp", "struct"]

    assert_frame_equal(
        df.toPandas()
        .sort_values(sort_by, ignore_index=True)
        .drop(incompatible_types, axis="columns", errors="ignore"),
        expected.to_pandas()
        .sort_values(sort_by, ignore_index=True)
        .drop(incompatible_types, axis="columns", errors="ignore"),
    )
```
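A short usage sketch mirroring how the tests above call this helper (the function name `check_table` and the table URI are hypothetical):

```python
import pyarrow as pa

# Relative import, as used inside the pyspark_integration test package.
from .utils import assert_spark_read_equal


def check_table(table_uri: str) -> None:
    expected = pa.table({"c1": pa.array([4, 5, 6], type=pa.int32())})
    # Reads the table back through Spark and compares it (as Pandas) against
    # the expected pyarrow data, sorting by c1 before comparing.
    assert_spark_read_equal(expected, table_uri, sort_by="c1")
```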