From eb10ee1f04f9c252cba27d3ccb7ec1a741a65e8c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Nov 2021 15:41:05 -0800 Subject: [PATCH 1/8] add decimal128 validation --- python/cudf/cudf/io/parquet.py | 21 ++++++++++++++++++ .../data/parquet/decimal128_file.parquet | Bin 0 -> 1226 bytes python/cudf/cudf/tests/test_parquet.py | 10 +++++++++ 3 files changed, 31 insertions(+) create mode 100644 python/cudf/cudf/tests/data/parquet/decimal128_file.parquet diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9d665d9a0a5..9c8c36840f8 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import fsspec +import pyarrow as pa from pyarrow import dataset as ds, parquet as pq import cudf @@ -411,6 +412,26 @@ def read_parquet( filepaths_or_buffers.append(tmp_source) if engine == "cudf": + # Temporary error to probe a parquet file + # and raise decimal128 support error. + if len(filepaths_or_buffers) > 0: + try: + metadata = pq.read_metadata(filepaths_or_buffers[0]) + except TypeError: + pass + else: + arrow_types = metadata.schema.to_arrow_schema().types + for arrow_type in arrow_types: + if isinstance(arrow_type, pa.Decimal128Type): + if ( + arrow_type.precision + > cudf.Decimal64Dtype.MAX_PRECISION + ): + raise NotImplementedError( + "Decimal type greater than Decimal64 is not " + "yet supported" + ) + return libparquet.read_parquet( filepaths_or_buffers, columns=columns, diff --git a/python/cudf/cudf/tests/data/parquet/decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/decimal128_file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7583a8c7604727d41e5a2eaf38e55e1dd3068371 GIT binary patch literal 1226 zcmb7^eP|PR7{{Nx#564$Uw_v-%^;p@dS&T(m)O*_0o`V#*rGD4h)&BTcd5aoO)rU} zU3>*~3<@ioMNM%-C#H1WD)>?rDz%~@!Kp1m`iGiGH#a++ze`iwS%wG)cfarR zJfG*``+K;hAAcqyKt~DGLD&$7KMlxec9NVPT0APP2V#3U092=z>G8`-=m4ZiS0FEG zgpDifVg1v?(EQgoz|(!O#N*IvU_ws}Hng>YaBvq?wsyke%JK|_?OO(F?@#dQ0Tn*@ z`aF0FD{#930=@iiX9N6pL)>?SHvF6a(W(@I+ip&6(z<_0=t!uZb!BQEGs30Al)>WzxMuKkOP1F=S_an1)k&m(-MbX%3!Uzd!zQ-XPr=T>%g9cSHCxbC)wQe z<+b)Is7hZh+))-k3G%PkXD4=2F+lfvlD<<81q$bLE#sOVE*u$;o%P3~*5!Q(DCoMm zyEy$kACXK|#rQ~AQrFi+eCw2ugbaX8$cRYVAS=Gou%jRkFN8@G$79imMbRD>MJ!W@ z*5oNz&cU)>2x}^dIG{Mj%($&aGcqwH1|u-TFho)u^^~9n2&>Wk){Ze|Y&0Men`ixp z7%6?zGE>|`QkO^^*Q(#%lhK4;{Rm-rx(uL0N1f1Fd-ZPT|6SrJ40Sh$;()QybiG>3 zOJNCn#Aoc{Hz;Z(P+v!nbMOK$0$nsFBhnD$v9k>^CCaPy8~KnDm1HR@(E~&y8{HIE z*fKT}Rb##=Yh(QZB`8POvek`jofOhSL-ylPHWi-(At}g}+8jk#t&0VNk~fGmF+#Di z!*sT+F&?*#reyAGEJw84MDhJd+7*ul*#+CXIcwrTFUq=698xV=`z#_aV*4zdHx>xW z7HO_a_RsaZTn@X}sra2vheH;eg43mx2tJp$)Kw~Y6|XE7F>W)!&;u9n$}GiyvDcZ@ BbxQyM literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b6595be9566..da12c0480ff 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -626,6 +626,7 @@ def test_parquet_reader_spark_timestamps(datadir): assert_eq(expect, got) +@pytest.mark.xfail(reason="decimal128 not yet supported in cuDF") def test_parquet_reader_spark_decimals(datadir): fname = datadir / "spark_decimal.parquet" @@ -640,6 +641,15 @@ def test_parquet_reader_spark_decimals(datadir): assert_eq(expect, got) +def test_parquet_reader_decimal128_error_validation(datadir): + fname = datadir / "decimal128_file.parquet" + with pytest.raises( + NotImplementedError, + match="Decimal type greater than Decimal64 is not yet supported", + ): + cudf.read_parquet(fname) + + def test_parquet_reader_microsecond_timestamps(datadir): fname = datadir / "usec_timestamp.parquet" From 7ece2ed46192588038b8a44f70e4d7e595592ee6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Nov 2021 17:13:28 -0800 Subject: [PATCH 2/8] handle nested types --- python/cudf/cudf/io/parquet.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9c8c36840f8..fadc47186a1 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -422,15 +422,12 @@ def read_parquet( else: arrow_types = metadata.schema.to_arrow_schema().types for arrow_type in arrow_types: - if isinstance(arrow_type, pa.Decimal128Type): - if ( - arrow_type.precision - > cudf.Decimal64Dtype.MAX_PRECISION - ): - raise NotImplementedError( - "Decimal type greater than Decimal64 is not " - "yet supported" - ) + if isinstance(arrow_type, (pa.ListType, pa.StructType)): + val_field_types = arrow_type.value_field.flatten() + for val_field_type in val_field_types: + _check_decimal128_type(val_field_type.type) + else: + _check_decimal128_type(arrow_type) return libparquet.read_parquet( filepaths_or_buffers, @@ -550,3 +547,11 @@ def merge_parquet_filemetadata(filemetadata_list): ParquetWriter = libparquet.ParquetWriter + + +def _check_decimal128_type(arrow_type): + if isinstance(arrow_type, pa.Decimal128Type): + if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION: + raise NotImplementedError( + "Decimal type greater than Decimal64 is not " "yet supported" + ) From 4f98d4f00a0520df0bb810467f8f9a7054799c2d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Nov 2021 17:24:25 -0800 Subject: [PATCH 3/8] handle nested types --- python/cudf/cudf/io/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fadc47186a1..1c2e8b60480 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -422,10 +422,12 @@ def read_parquet( else: arrow_types = metadata.schema.to_arrow_schema().types for arrow_type in arrow_types: - if isinstance(arrow_type, (pa.ListType, pa.StructType)): + if isinstance(arrow_type, pa.ListType): val_field_types = arrow_type.value_field.flatten() for val_field_type in val_field_types: _check_decimal128_type(val_field_type.type) + elif isinstance(arrow_type, pa.StructType): + _ = cudf.StructDtype.from_arrow(arrow_type) else: _check_decimal128_type(arrow_type) From 067e9c7eecd41e7c0aee90f7ee9ae1f434eead07 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 1 Dec 2021 13:06:42 -0800 Subject: [PATCH 4/8] add nested tests --- .../tests/data/parquet/decimal128_file.parquet | Bin 1226 -> 0 bytes .../data/parquet/nested_decimal128_file.parquet | Bin 0 -> 728 bytes python/cudf/cudf/tests/test_parquet.py | 15 +++++++++------ 3 files changed, 9 insertions(+), 6 deletions(-) delete mode 100644 python/cudf/cudf/tests/data/parquet/decimal128_file.parquet create mode 100644 python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet diff --git a/python/cudf/cudf/tests/data/parquet/decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/decimal128_file.parquet deleted file mode 100644 index 7583a8c7604727d41e5a2eaf38e55e1dd3068371..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1226 zcmb7^eP|PR7{{Nx#564$Uw_v-%^;p@dS&T(m)O*_0o`V#*rGD4h)&BTcd5aoO)rU} zU3>*~3<@ioMNM%-C#H1WD)>?rDz%~@!Kp1m`iGiGH#a++ze`iwS%wG)cfarR zJfG*``+K;hAAcqyKt~DGLD&$7KMlxec9NVPT0APP2V#3U092=z>G8`-=m4ZiS0FEG zgpDifVg1v?(EQgoz|(!O#N*IvU_ws}Hng>YaBvq?wsyke%JK|_?OO(F?@#dQ0Tn*@ z`aF0FD{#930=@iiX9N6pL)>?SHvF6a(W(@I+ip&6(z<_0=t!uZb!BQEGs30Al)>WzxMuKkOP1F=S_an1)k&m(-MbX%3!Uzd!zQ-XPr=T>%g9cSHCxbC)wQe z<+b)Is7hZh+))-k3G%PkXD4=2F+lfvlD<<81q$bLE#sOVE*u$;o%P3~*5!Q(DCoMm zyEy$kACXK|#rQ~AQrFi+eCw2ugbaX8$cRYVAS=Gou%jRkFN8@G$79imMbRD>MJ!W@ z*5oNz&cU)>2x}^dIG{Mj%($&aGcqwH1|u-TFho)u^^~9n2&>Wk){Ze|Y&0Men`ixp z7%6?zGE>|`QkO^^*Q(#%lhK4;{Rm-rx(uL0N1f1Fd-ZPT|6SrJ40Sh$;()QybiG>3 zOJNCn#Aoc{Hz;Z(P+v!nbMOK$0$nsFBhnD$v9k>^CCaPy8~KnDm1HR@(E~&y8{HIE z*fKT}Rb##=Yh(QZB`8POvek`jofOhSL-ylPHWi-(At}g}+8jk#t&0VNk~fGmF+#Di z!*sT+F&?*#reyAGEJw84MDhJd+7*ul*#+CXIcwrTFUq=698xV=`z#_aV*4zdHx>xW z7HO_a_RsaZTn@X}sra2vheH;eg43mx2tJp$)Kw~Y6|XE7F>W)!&;u9n$}GiyvDcZ@ BbxQyM diff --git a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..78263d58db49b031a7c602ea746e4bd5dae83ad5 GIT binary patch literal 728 zcmcgq&riZo4DQCuGKd-_blH+cIUwNxh$b_}=%Mo$Dn=kt660lA2#|;(p!*N}OTCzQ z^ytC%`GboWFZS@Yul?G7Z+lH@ucN{`ZZyzFl?5mO%o~uIw-w};(uHOCvJwL3u}~Ho zT4?!h`lmDtGY*t~%`N?aw>7ug03#W{=of0&J&5357j6BRc~ok?+z=vX|TzdfH0B&A`!$@3CEwUP@-)&)LCa|_XlfTo6hi8#r9dUcD!mdrmQ zifMmVl!z&@Ch4U=QguIf+|;f*HBIAw)~V-cDqn(4OWOr^WfYC?AG*^@JQ`J^@%S#u aKX%8H!QE{>sMe}A&-eKsy%+#Ze$yXnI(s$% literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index da12c0480ff..9bbb29a01d9 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -626,23 +626,26 @@ def test_parquet_reader_spark_timestamps(datadir): assert_eq(expect, got) -@pytest.mark.xfail(reason="decimal128 not yet supported in cuDF") def test_parquet_reader_spark_decimals(datadir): fname = datadir / "spark_decimal.parquet" - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) + # expect = pd.read_parquet(fname) + with pytest.raises( + NotImplementedError, + match="Decimal type greater than Decimal64 is not yet supported", + ): + _ = cudf.read_parquet(fname) # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF # This is because cuDF returns as float64 as it lacks an equivalent dtype - expect = expect.apply(pd.to_numeric) + # expect = expect.apply(pd.to_numeric) # np.testing.assert_allclose(expect, got) - assert_eq(expect, got) + # assert_eq(expect, got) def test_parquet_reader_decimal128_error_validation(datadir): - fname = datadir / "decimal128_file.parquet" + fname = datadir / "nested_decimal128_file.parquet" with pytest.raises( NotImplementedError, match="Decimal type greater than Decimal64 is not yet supported", From d4fc028f0dbce9ec3986f7fa318fbc67900878a1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 6 Dec 2021 10:44:14 -0800 Subject: [PATCH 5/8] address reviews --- python/cudf/cudf/io/parquet.py | 9 +++++++-- .../parquet/nested_decimal128_file.parquet | Bin 728 -> 1692 bytes python/cudf/cudf/tests/test_parquet.py | 5 +++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index de18310321c..dc6531dcdb3 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -623,8 +623,13 @@ def _read_parquet( except TypeError: pass else: - arrow_types = metadata.schema.to_arrow_schema().types - for arrow_type in arrow_types: + arrow_schema = metadata.schema.to_arrow_schema() + check_cols = arrow_schema.names if columns is None else columns + for col_name, arrow_type in zip( + arrow_schema.names, arrow_schema.types + ): + if col_name not in check_cols: + continue if isinstance(arrow_type, pa.ListType): val_field_types = arrow_type.value_field.flatten() for val_field_type in val_field_types: diff --git a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet index 78263d58db49b031a7c602ea746e4bd5dae83ad5..7440d357a1263ce9c28b6681e38c35540d5e2af8 100644 GIT binary patch literal 1692 zcmcgtPj3=I6n{HA(1qBhHl4|CvQZ9nLn5uU2}xt{Fbfr$f&!*0iI;^C#I_az#rOgI z6nqfRo;(>39zA;O(W8mJH&f^Z4jN?wZ{DBZ`#;InRO61|qAd0*!V!D}APul;L2}jM zuz5X^z5$mfBS54?I!iDS7jfTB%uH8oQbJ~~9i#0FDA@1#4 z5kKt>N4z)coCzi!`JTxiR`57*{xp@B#S*z-fF;ufbS=L{u3P*Vo3&WBS__(-@Zhuk zPuj#4Rj7O1 zTDUJRKUwmN6?|I>CM&)>C&q%vbGF62NeHJq76}eH=30?pB3{X32|4cSmqH6( z!-klv-^^AL5t-~WqJ%=9)?fMxzD3?3x#xTNrM=Q%nQ!Gk?@)e?Nsg0~A7YyTp6^~$ zjmE<#cg;@S3;Q;my>6(Ks^g!|ky2w7cGZ2Qx~kCCx)1?X;tEO~(v9-;Mv2%WsY#FI z9R}S;=HNj|%VVMkL@*?BI;hD8%sWPuB1)Ql*+i43s!ifAq9*+R{l1$As7MwW+9O zaw(cB+T>*Xb?Ot6^^{9mMp8F+1YsQPr>VlRnX@)(hgm(kiN*Dpd7vHRXz1}QpNVr1 v+?)Mq@alDEwB7Fax1+(}e4KjI84P>pFH^al-JM-8?*o`0{IUUR_+9!9wrcs_ delta 263 zcmbQkdxLet3w|a^A5#Viw&LWB)Z9c-rpXVPOxeU(BxNLJCe~$YinTF`v8erF6#K#` z!oVOYBf;Ys6yzUn1yL^XhEX2`%+ehvH?i2&yE*~^m@o_g3YPHG{6^RAdm*DbpwmKPEKUg;FXbKU|@L3$iNWb7-R?lF+oI| diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9c7e39b625d..0f2615f785f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -644,13 +644,14 @@ def test_parquet_reader_spark_decimals(datadir): # assert_eq(expect, got) -def test_parquet_reader_decimal128_error_validation(datadir): +@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None]) +def test_parquet_reader_decimal128_error_validation(datadir, columns): fname = datadir / "nested_decimal128_file.parquet" with pytest.raises( NotImplementedError, match="Decimal type greater than Decimal64 is not yet supported", ): - cudf.read_parquet(fname) + cudf.read_parquet(fname, columns=columns) def test_parquet_reader_microsecond_timestamps(datadir): From 2ddf2cc6cbea434a84b8780c74913e272ae3348d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 6 Dec 2021 12:46:40 -0600 Subject: [PATCH 6/8] Update python/cudf/cudf/io/parquet.py Co-authored-by: Vukasin Milovanovic --- python/cudf/cudf/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index dc6531dcdb3..54b164cb3c3 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -762,5 +762,5 @@ def _check_decimal128_type(arrow_type): if isinstance(arrow_type, pa.Decimal128Type): if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION: raise NotImplementedError( - "Decimal type greater than Decimal64 is not " "yet supported" + "Decimal type greater than Decimal64 is not yet supported" ) From c9a93045af525ed26ba8a6d019e9e836adc655c8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 7 Dec 2021 09:45:02 -0800 Subject: [PATCH 7/8] add comment --- python/cudf/cudf/io/parquet.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 54b164cb3c3..f9b39bf2cfa 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -621,6 +621,10 @@ def _read_parquet( try: metadata = pq.read_metadata(filepaths_or_buffers[0]) except TypeError: + # pq.read_metadata only supports reading metadata from + # certain types of file inputs, like str-filepath or file-like + # objects, and errors for the rest of inputs. Hence this is + # to avoid failing on other types of file inputs. pass else: arrow_schema = metadata.schema.to_arrow_schema() From 6014e1bf14b819b686b82bf41c0ba7936da33e35 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 7 Dec 2021 13:52:51 -0600 Subject: [PATCH 8/8] Update python/cudf/cudf/tests/test_parquet.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 0f2615f785f..597ae6c05c0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -634,7 +634,7 @@ def test_parquet_reader_spark_decimals(datadir): NotImplementedError, match="Decimal type greater than Decimal64 is not yet supported", ): - _ = cudf.read_parquet(fname) + cudf.read_parquet(fname) # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF # This is because cuDF returns as float64 as it lacks an equivalent dtype