Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Raise temporary error for decimal128 types in parquet reader #9804

Merged
merged 12 commits into from
Dec 7, 2021
28 changes: 28 additions & 0 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from uuid import uuid4

import fsspec
import pyarrow as pa
from pyarrow import dataset as ds, parquet as pq

import cudf
Expand Down Expand Up @@ -411,6 +412,25 @@ def read_parquet(
filepaths_or_buffers.append(tmp_source)

if engine == "cudf":
# Temporary error to probe a parquet file
# and raise decimal128 support error.
if len(filepaths_or_buffers) > 0:
try:
metadata = pq.read_metadata(filepaths_or_buffers[0])
vuule marked this conversation as resolved.
Show resolved Hide resolved
except TypeError:
vyasr marked this conversation as resolved.
Show resolved Hide resolved
pass
else:
arrow_types = metadata.schema.to_arrow_schema().types
for arrow_type in arrow_types:
if isinstance(arrow_type, pa.ListType):
val_field_types = arrow_type.value_field.flatten()
for val_field_type in val_field_types:
_check_decimal128_type(val_field_type.type)
elif isinstance(arrow_type, pa.StructType):
_ = cudf.StructDtype.from_arrow(arrow_type)
else:
_check_decimal128_type(arrow_type)

return libparquet.read_parquet(
filepaths_or_buffers,
columns=columns,
Expand Down Expand Up @@ -529,3 +549,11 @@ def merge_parquet_filemetadata(filemetadata_list):


ParquetWriter = libparquet.ParquetWriter


def _check_decimal128_type(arrow_type):
    """Raise if ``arrow_type`` is a decimal128 type wider than Decimal64.

    cuDF currently supports decimals only up to
    ``cudf.Decimal64Dtype.MAX_PRECISION``, so parquet schemas containing
    wider decimal columns are rejected with a clear error before being
    handed to the libcudf reader.

    Parameters
    ----------
    arrow_type : pyarrow.DataType
        Arrow type probed from the parquet file's schema.

    Raises
    ------
    NotImplementedError
        If ``arrow_type`` is a ``pa.Decimal128Type`` whose precision
        exceeds what Decimal64 can represent.
    """
    if isinstance(arrow_type, pa.Decimal128Type):
        if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION:
            # Single literal (was an accidentally split adjacent-string
            # pair); the runtime message is unchanged and is matched
            # verbatim by the tests.
            raise NotImplementedError(
                "Decimal type greater than Decimal64 is not yet supported"
            )
Binary file not shown.
21 changes: 17 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,15 +629,28 @@ def test_parquet_reader_spark_timestamps(datadir):
def test_parquet_reader_spark_decimals(datadir):
    """Reading a spark-written decimal parquet file must raise until
    decimal128 support lands in the cuDF parquet reader.

    The previous round-trip comparison against pandas is intentionally
    gone: cuDF cannot yet represent decimals wider than Decimal64, so
    the reader is expected to fail loudly rather than return data.
    """
    fname = datadir / "spark_decimal.parquet"

    with pytest.raises(
        NotImplementedError,
        match="Decimal type greater than Decimal64 is not yet supported",
    ):
        cudf.read_parquet(fname)


def test_parquet_reader_decimal128_error_validation(datadir):
    """A parquet file containing nested decimal128 columns must be
    rejected with a clear ``NotImplementedError``."""
    expected_msg = "Decimal type greater than Decimal64 is not yet supported"
    with pytest.raises(NotImplementedError, match=expected_msg):
        cudf.read_parquet(datadir / "nested_decimal128_file.parquet")


def test_parquet_reader_microsecond_timestamps(datadir):
Expand Down