From cd7a0adb7c980c13861750c9c99e0fb5bda92492 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Sat, 5 Dec 2020 00:08:02 +0800 Subject: [PATCH] Parquet option for strictly decimal reading (#6908) This PR adds a Parquet option that determines whether to strictly read all decimal columns as fixed-point decimal types, or to convert decimal columns that are not backed by int32/64 to float64. --- CHANGELOG.md | 1 + .../java/ai/rapids/cudf/ParquetOptions.java | 18 +++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 22 ++++++++------- java/src/main/native/src/TableJni.cpp | 3 ++- .../test/java/ai/rapids/cudf/TableTest.java | 25 ++++++++++++++++++ java/src/test/resources/decimal.parquet | Bin 0 -> 6604 bytes 6 files changed, 59 insertions(+), 10 deletions(-) create mode 100644 java/src/test/resources/decimal.parquet diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b88b350725..fd3a045c9b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -127,6 +127,7 @@ - PR #6837 Avoid gather when copying strings view from start of strings column - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp - PR #6807 Refactor `std::array` usage in row group index writing in ORC +- PR #6908 Parquet option for strictly decimal reading ## Bug Fixes diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index 4ef1b713531..d8bb6b45f88 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -27,22 +27,30 @@ public class ParquetOptions extends ColumnFilterOptions { private final DType unit; + private final boolean strictDecimalType; + private ParquetOptions(Builder builder) { super(builder); unit = builder.unit; + strictDecimalType = builder.strictDecimalType; } DType timeUnit() { return unit; } + boolean isStrictDecimalType() { + return strictDecimalType; + } + public static Builder builder() { return new Builder(); } public static class 
Builder extends ColumnFilterOptions.Builder { private DType unit = DType.EMPTY; + private boolean strictDecimalType = false; /** * Specify the time unit to use when returning timestamps. @@ -55,6 +63,16 @@ public Builder withTimeUnit(DType unit) { return this; } + /** + * Specify how to deal with decimal columns that are not backed by INT32/64 while reading. + * @param strictDecimalType whether to strictly read all decimal columns as fixed-point decimal types + * @return builder for chaining + */ + public Builder enableStrictDecimalType(boolean strictDecimalType) { + this.strictDecimalType = strictDecimalType; + return this; + } + public ParquetOptions build() { return new ParquetOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b7841c33a79..66552acc5bc 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -219,15 +219,17 @@ private static native long[] readCSV(String[] columnNames, String[] dTypes, /** * Read in Parquet formatted data. - * @param filterColumnNames name of the columns to read, or an empty array if we want to read - * all of them - * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. - * @param timeUnit return type of TimeStamp in units + * @param filterColumnNames name of the columns to read, or an empty array if we want to read + * all of them + * @param filePath the path of the file to read, or null if no path should be read. + * @param address the address of the buffer to read from or 0 if we should not. + * @param length the length of the buffer to read from. 
+ * @param timeUnit return type of TimeStamp in units + * @param strictDecimalTypes whether strictly reading all decimal columns as fixed-point decimal type */ private static native long[] readParquet(String[] filterColumnNames, String filePath, - long address, long length, int timeUnit) throws CudfException; + long address, long length, int timeUnit, + boolean strictDecimalTypes) throws CudfException; /** * Setup everything to write parquet formatted data to a file. @@ -618,7 +620,8 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId(), + opts.isStrictDecimalType())); } /** @@ -678,7 +681,8 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; return new Table(readParquet(opts.getIncludeColumnNames(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId(), + opts.isStrictDecimalType())); } /** diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4dc32307552..3ec1f5e3c94 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -787,7 +787,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( JNIEnv *env, jclass j_class_object, jobjectArray filter_col_names, jstring inputfilepath, - jlong buffer, jlong buffer_length, jint unit) { + jlong buffer, jlong buffer_length, jint unit, jboolean strict_decimal_types) { bool read_buffer = true; if (buffer == 0) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be 
supplied", NULL); @@ -823,6 +823,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( .convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); + opts.set_strict_decimal_types(static_cast(strict_decimal_types)); cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index edacfc37cc6..c2879d09b54 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -67,6 +67,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ORC_FILE = new File("src/test/resources/TestOrcFile.orc"); private static final File TEST_ORC_TIMESTAMP_DATE_FILE = new File( "src/test/resources/timestamp-date-test.orc"); + private static final File TEST_DECIMAL_PARQUET_FILE = new File("src/test/resources/decimal.parquet"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -717,6 +718,30 @@ void testReadParquetFull() { } } + @Test + void testReadParquetContainsDecimalData() { + try (Table table = Table.readParquet(TEST_DECIMAL_PARQUET_FILE)) { + long rows = table.getRowCount(); + assertEquals(100, rows); + DType[] expectedTypes = new DType[]{ + DType.create(DType.DTypeEnum.DECIMAL64, 0), // Decimal(18, 0) + DType.create(DType.DTypeEnum.DECIMAL32, -3), // Decimal(7, 3) + DType.create(DType.DTypeEnum.DECIMAL64, -10), // Decimal(10, 10) + DType.create(DType.DTypeEnum.DECIMAL32, 0), // Decimal(1, 0) + DType.create(DType.DTypeEnum.DECIMAL64, -15), // Decimal(18, 15) + DType.FLOAT64, // Decimal(20, 10) which is backed by FIXED_LEN_BYTE_ARRAY + DType.INT64, + DType.FLOAT32 + }; + assertTableTypes(expectedTypes, table); + } + // A CudfException will be thrown here because we do not yet support reading decimals stored as 
FIXED_LEN_BYTE_ARRAY. + ParquetOptions opts = ParquetOptions.builder().enableStrictDecimalType(true).build(); + assertThrows(ai.rapids.cudf.CudfException.class, () -> { + try (Table table = Table.readParquet(opts, TEST_DECIMAL_PARQUET_FILE)) {} + }); + } + @Test void testReadORC() { ORCOptions opts = ORCOptions.builder() diff --git a/java/src/test/resources/decimal.parquet b/java/src/test/resources/decimal.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cc0da9ed948458790952aeb0c0e2cdc6f10d61b5 GIT binary patch literal 6604 zcmb7J3p`ZY_g`mT#+WI}4C9seD@_>BBIT7wq&!13o>$Fymn0#j6e*NPk6Rw4Qj$m{ zNl8RE6`@>8k|-4^h5w%Vb#M27@BjDt-}A9&?X}lh-?R2vYn{FK;jnDA79HqALi8~q z4SFMn`XoA8N>5moJTv2as;r`yCQj~m!0#E^Bn0RhCytHbL{LW=!}lm+_$|U1ae6`q z!%q)%6&Z(>mQh(!fd^%MWC)GFO2lX>`p`;!suZq<9m54GaKr|NB%m}&Eg^VCI#$%1 z`t|s)Z-_s@U-fKc9sk&h7yFW9t6sz;KFF~+!_1xyEDu3vpASuf7;@`f$Ixu_~_i6oO?C>qq1)Heecw&D#VtUc&=#ei(pBD zrR}YY?Ezi*XaU>SdG=(1)bgkRnKr%tJh;`7pVGTcdE~8wO|()5X^b`Lj(cJ>-;At zs|AMajqkc_<3R=aLHbaO7k&=rle*#bc5L$pmZm^N6}j!>(W}_G&lKlY{qaI zyvtk{3TZOTyYP-7OMLdnnz4jtAwmdh&z-G;)H~tz!_8&>G-h&*euk+|89dyLIX7^3{Mhw(t_WD_kydw5kyi* zU`(b#Ssxh&t?_V1RTUmeAArL-PT-Qqga06b;}b`~4?Dru>}4S5PlmbM)`M=MD>%AQ zpiylWe0UcJ*YUQn$!jiLDH4U)6c+3O4|pUf2Ca+aplOv0I1TXN`CK*dxw#sqq^K~m zLlB&-OyODw5Ax&!!9uH@!OF{Ei8q}V01;P{&uCSbd?qde?GbEs$8VXfh zH#l*d1HVgQFcKCGznD=V>7_5M9Hm0x9$9#KZyxl$$%Hw|;?Py+4xXh#(7ZSko_dJG z5@P`%By9w-(m+^D(S%_$A~=H^q&~HTRxBId>T^JD1P@=O?cmmI3Yd6dP$w=9VXNgJ zd6f;Q8c|?M9|wz@LP4@v8`xgx5ui-TwnE-2O8!l2SZNW6fDx26kVyEO+C zi-h2_au%S_outwSD1QwddFHv06ArY4Cm`v?5X8q!qlpm^{)`o@riwkk*V9GM&%Gu-?#Y=CM$fl=aLK#6QN<$(a5#+ zm=M*o53mPR5xh`?00+FuV z>3N8l?3~DQYB9w5y$>)$$X^cireLbn zQ1zR;(BRV-Pc0Ewp28MDvNKjg<>(Gn;x01cj(ck9BVy@bgQ4kNu-s;c3?{Bs1fju{ zb_FHOftuC({0ri~rndGWqMiEO3OV6^*e@PoKH+YUL|`X3nra$pCB5B8%e9ZCs$$rd=8YP=)j<{+cUA{{VTTbB+ zdDT3%7?E`fm6s5SV#W@j=i;OrPrN{#aH+)*^7D18dB{ziVI>oBdD2u~ z<<=^+gnO1&xR;V-@5lh#xE@2Cj=k84mQ#vUP3^jXQ=|+5=ZsGu?LLVS(qCUpJU-e_ 
z_yK$>@b=LQW<%ueZnM+g2OBXJVV%Ya)-mXa3A}IBtVijNF)lkySVX*0qA&Tl@-75P zQP!KT8OP@AxlxdJ(Ok%b6;-IaK?C9*dFl;cxGMC3^g^n9Iv>Bb)TdbU$u;7SsjQnR zPW{ve}AD)xEoV#9CVE0gOPF?>f ziEjHS|DaVcxXyR&D0_>+l$pwJr6+RTFP+!V`1rCKBUWMU8gOzKWU+_9{JEL`S343}8Aw3OQ)w4YW zA!*?J=A*(C1=~+W)*_a)S?a=9vuiVHr)i?2VWS2DJg&*O<&6rMOiMV~?Os8kSUpiM z9A(30pXBI8Nn^wcORD7xHwsZZy}j~n!E*AB`q8(2cTylS*NznPQC7h5^R^x9n2n?# z^N;n&_#|H~ht2Q4Ca*Mj4Cb)vE2R4B1T#)BhGsiW5wRU=n%C_1(N<`W^n@Sj<`>?# zOOe_tx}4Jn>k1oZt3T-_csB_h`YfwVMHMnYqmR=5xcGH{{ zn0p=Jd?1a5g7-)^+@4tmHuwNeQ$I9Ye1T`1gME}ImWblb|b(j8qlNGqV z)4C;{jcWp;P1oqY0|Y}LoD)*NZ1rj-VC@-BMeC9>fN@>*3c>yFZG4@gK%hvkmRl371w!v`oNga9ZUdfR zw`EUM)oCDU$Q>HsxRv}lWsl|{-}>zyC19#&WapwjaT-=6?VeAl$+`n7Vw|1^>Z
    6J}f){Rq`g)U7Tk{vHgHxRP!^W_VMj&14Zhy64r5CUxDVx#)8-&5JDCm;#N3mTX zR78NV68r8{SEM@a zj}!piH}Xq*9k69&1xLeW@0*$XJE;1O&hZPVvH`V1n z26*8F67gdG7l@o_u5CL$sRqtb=Mr=$tT!U~#3^4%aOpDm~k@Et0eoVm~~kyJ=XJ zg7vn2T-t`(UNApUvZ3&c)fo7VA31*gG1dl=4{xRHj9R=KNVf87Pdn=ef#!c?^Dl#B zIf#4nf_=#C?n?-MRD*LLo|g&p%DsL|35`Dj45+GnlcgI58Z9>Si=}Uyg6s0V9qH=*CKwE3eiV@UdP1ArNenSPxfTQ{;`RSXP~q5-$(g%OB{QE@#|It)$Lnh z46mn+;U^_9HM3upIv@JqGnYRfUO%!Xi+pv`_%7M8DK~VhN2ovJV7q65%%c^`{njn7 zMWW4jS1KH1Z}-AITR$Q0>}zrBh{=lG1Xx#K;Y0T)GuMBqyRhB5jzA|o+`}6JFR9G8 z`hAq#6`(zTXv;|3)YmBw8IWK*2`2+8XnmJ{ncTcR; z+_KPQ&!Fvg^Q4ELW!`B&x8+b2vxVaW)qSjFI_DBy+$q=gZ!sU5QoJf{+4xRIaqf^q z`wh_((#>x9_f$d`jGsSX@M=tHtRYMFiFs%Av)^YHux>AWQ~OFVqp=~;M6V@J`n0`- zQDXxBn#v%nIa*(xHpb)JG>XpMr}lt8t9965Q~x_D0d8t%d^Jvt$u8N!J?dH}A%S)> z->yA-Csq8sn7Omx0ih`hdr$Xpqv zxOZ9C=Np?-6zQVlv07tw4%SdE5oyEnbqQ@gLi9!CRfz!(&^$&4bmMR*v9lX(s>8sAFH3e7r%3%kcZox?bAl$a4-76 z+reL(%G%6Z z6k*5p7f*IAXX}s>K6Yd-I_Jx3NWmCY>H%aC{5n-lH4@|XW+prKS(7AdO`n-t$2t5p zPtS$)l_oMJ-WkCM4(nQ>y2&M>!}Cw8n>gB}UXK$ejQ7P(F(|4(ap=LO;>S`j{vHsW zou@@n3&4n*F!gliLcURXLQ9E&%PP4O3iz1pr^TJrXY`HittBVc2PKY+pp9^RExtEi({i9B zGu1(rr}1KeX7wu(J=1irG^MFxZGnW9>^bGl8H;(cF0}LK)d-uaS+Y4jF=uNd*=R!r@v=2I z%l_rOwYr=3`&~cKdZV>9U$?^L{?0j>^>#L_GSAzu!`n6d=#6UT6RVl{<8Q65=Jgk! zlz!kB+d@10ggr~y^MgVCGN0#7+iw+!#?@--PZ$@z9y8e1p0**rMp5g;<<1c^&fW=C zreja1Iaea>mn`n?^2=Q;bZe6|(ImlpwWDIa$0l}tp2XdlvE+>E>gPy(u@1Z=PL!Zm zHI<%>OOvyFCwtOkONUwuy>LrMOWkpEI*xh7iW(8h3E$uy;roj}k3$zm#||Z&INsM= zQ;C8itO#`6cU}weB>JpB_B{|MX?yXN2AJzTKNZCVg|B z{T<;dj&U8QynCLJ-d_)4Obv9vBOtGl>wJQoZcR3!U0{U1k?pH5dl zyO6??qR;ws`nul{Zp>obou!<@;!onAcb`9FJU04^7kb|@2B?fts`6taqkpkB{*2ma zEk!5)>6!j_)JYO!hk$aUwb6fhbNqzDxVZ}D128qj#Y4gaS)5P~%39X91ZHjW503~6 z2^J@^S(+>@5Lf$~Mno8wg>pS?qkkkTJS2+6^N-~CaUwb5#e^97$mmdi`9<;(k>MMC zBjq*Z1A_dyei8DEJYwX7IXu1yl>{39kDi~uZxD~eRn;=k&{RXi!5g_;jt>{uUNM`sf?#Yx?;6_-UyzQ>4&>_)mXu M41LsbK>x@2AM0^BBLDyZ literal 0 HcmV?d00001