From c31889ea6b79e14262e765fc266cc395a4d5d425 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 26 Jul 2021 15:35:10 -0700 Subject: [PATCH 1/6] upper pin dask --- conda/environments/cudf_dev_cuda11.0.yml | 4 ++-- conda/environments/cudf_dev_cuda11.2.yml | 4 ++-- conda/recipes/custreamz/meta.yaml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/custreamz/dev_requirements.txt | 4 ++-- python/dask_cudf/dev_requirements.txt | 4 ++-- python/dask_cudf/setup.py | 8 ++++---- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index b791d188700..62b59c3f081 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -40,8 +40,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 - streamz - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index b4ea3a81c6d..94c7116802b 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -40,8 +40,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 - streamz - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 158e34684b4..c98e61f6100 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -31,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml 
b/conda/recipes/dask-cudf/meta.yaml index 216acb7e068..af43f9ed2eb 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -26,13 +26,13 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 run: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask>=2021.6.0,<=2021.07.1 + - distributed>=2021.6.0,<=2021.07.1 test: requires: diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index 61e4817b1c2..3f5e3f84fd4 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.6.4 -dask>=2021.6.0 -distributed>=2021.6.0 +dask>=2021.6.0,<=2021.07.1 +distributed>=2021.6.0,<=2021.07.1 streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index cdd17d86649..ec530893b0d 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-dask>=2021.6.0 -distributed>=2021.6.0 +dask>=2021.6.0,<=2021.07.1 +distributed>=2021.6.0,<=2021.07.1 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index a714dd98904..b17f1bc7463 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask>=2021.6.0,<=2021.07.1", + "distributed>=2021.6.0,<=2021.07.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.3.0dev0", @@ -23,8 +23,8 @@ "pandas>=1.0,<1.3.0dev0", "pytest", "numba>=0.53.1", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask>=2021.6.0,<=2021.07.1", + "distributed>=2021.6.0,<=2021.07.1", ] } From 123204d50f57935a97c5a2bb4534482ca22fcb3c Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 27 Jul 2021 00:17:17 -0700 Subject: [PATCH 2/6] fix RLE stream size for timestamp columns; add test --- cpp/src/io/orc/writer_impl.cu | 4 ++-- .../data/orc/TestOrcFile.largeTimestamps.orc | Bin 0 -> 6327 bytes python/cudf/cudf/tests/test_orc.py | 19 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 123b636ef9b..d93845530d7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -575,8 +575,8 @@ orc_streams writer::impl::create_streams(host_span columns, break; } case TypeKind::TIMESTAMP: - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::INT); - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::INT); + add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::LONG); + add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::LONG); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::DECIMAL: diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc new file mode 100644 index 
0000000000000000000000000000000000000000..095b7372c89e8866cb318524d0b7936e44956e8b GIT binary patch literal 6327 zcmZvBX;@QN)b2Sa^Ke4ugiH()WJ;J3kpKb#K>~t+OaW1X2m}EMMQhYjf>Z%56{QN) zTEU4R4pFp5Z~(0e4i#!q)FR**plT3@kjw48|Gw|Xe%?K;wTHFdy`N{Tn79xc0N^oH z6t)7h4DbU08#3Msj|Ujw5-J6I7afUb1J)PEe*bXhW$`lbIsh2WTg)c&N3&^mauzVZ zn*3-st4-!#%wNprNway&M{`%#ta%n;{`$#eHX+QjT@#@B{a=$NvwC)JYS*NBPHJ|` zmbsCSU6A?zw#G3<{3{s13C8ojE2Fp_n$cokGO(;{ z@TO4xvN_V}gQ%;+^3&37Lqhxo@r^YGoZTTSYWB(QJScwPppBA3%1MgSEKH+LU0rL9 zd(=JckWW~2sK?@uQ?Lc%EO z9jW@$44(a0*OSKq^BvcbFiA%(M#I|t#vAg-e^a_(d-7KEd^bPHYxLjs08dOYpMUn+ zf99X6XOXCoD`*aYynBF7+lbsAdht4wnz(!Przc4Vf0vVrfsL^N@e1z?n-Z$}kJ~4Y zTu0a9&V;*P2y5>PXLgW+syBjl$e%S1Qd$WzUFXe?t_7UFh zuvgJ(#e_tkqL0C2K1F*AI>NH5J3`MgV9oiF)QGCt&+Fs!+k>LFe$6Bz3$LxHRNA>7 zbINo1)#&5hz0aGcc(WrGi%xp9(n97&FshFF=bf(0T;$qCaQfNLzlb z<+zvc+OMiwrp}{fm2guQ$HVrer>)ran-Mdh*pl#?XR8fSRWH4}^YA-z{JYIb8BAl1 z+x(Mh#r;6Q=VQxW z=a^Iaebz1jNx{5jMUn?Z_Z3ISh07IdA7&p_R6gV0U1OT%xg-7(GhCW_qDX81+A*=t z>Tz^HXs~Nklx44zq|o|9lb^$y$FYx_VYu!(S*(x#iwIL3CdJF4BAy(lc{%$ya(S?;maiKSszakQRBW~hD7Mb?(aLH0=)2hG&GP)0! zZ)h%X_@}8N`on&6aAAi8Xk4EWFu&FQbIT2m)4$(*J!8}EKA4!b=gID3_XCd#Y6VB! 
z(2t!3_qvY-D1)zETK{$E1Qn0#K^<7TW1FzszvO&SJMp(4hNeN2yG;rC_e_IZs%1Rv z!5%#)eW?gQ+c;nrmMv_Lc}#s+8{aTlx-aeNwV(V84vxMU^svWl8qZ|8QVv>3I}hDO zt{Axg)(wAnU4WQ6gxG`J{i`Vp8nD*+6x+q>suk?0)WO)@?Dwr9U-D}!sRe^}BR)sg zR!+cF>iul*%fV0Xud&$qlI?B&zIt=v-cu#{qV3<5yi0*5oI99Dq9!Z()t?;vWK{;Y zazG+H5|R-W-BFdf^1C%>lXW~zA^)-6^CNzL|MOrrjnWgliPOBYo@g)5t(^(~JLu}5 zj3V;A2^@{!n&)e`yVI!+x@$`oe?0WUi&fm!>5>@oHLxvuUOnnTYKCh0l>P4X#ZcirzS;6T&R~IV3zOtCvkr#sINKiQ zx{frJXnc;Z^W3TZ-YZR!Ja079O+M^Xw7o8R&=UKXDH>aMaGEY%QZ*5}&a~=A+3E#F zP;H^u)N+b!bPMz4j4Oh|!uBBE*3El#{^5m)d(sCJ7F#a+fABOE!VODKJk<%}+`*Xu z?yD!$ZHwvU^&b{)8$0C|6mjF^5VuL>9OO}-w{cIw;sx&PA7s=LMbZQ@tM)y=4(tmT;Qwicw zbk38FZxic_;-(i|A8;(RzR)l3aQ*=hH>3T?KVKBaHQ%qsoJi}2cq(Z^4o{|vtc#(B z>!Vk({7&{#X5MeRem%5avb;7e=5XA5|0Rv^kK>+%v?;tb)njBv zxkX=?B~tJ0IRj_k5yZvFT9968+mWh=!Dmkd?~dCf&kq30Q?mThS~pxedu*^xKI@cW#aXwo>NeG>n zxVXHcyfAJ@H$JL3GX1VV`0EPSX*DFEAR}#_%;+5if{><9y_!xG@_eZ;M0+?tmax9nlu!UAP6 zpVEfweM67R8~@HZxa5AtyH=&|%aOD4yqhm_3){p6rLmp&Kdp?J%5t=Gp3Ey;=GM`% zZP)s@r)X7C>)A21YyYeMj#E!>S0mLcxkjAH*B+?TYX-Zavw^G}nVO z7L@7p1!#hMpr3W~+S2mWqJwWjS1Sr+a-sX8_sHL#335ox zCX(%KIH%ULu{(CWu$}j$aobkRL9!mXG`6kJE6%;)M4bCo(rTW=e1D9GBJ;%F&;LC# zHe!iGi6c~T>d1n(+&m`0@2f7%7C$U)Gg9dM zA)k2B?b|{D_%||kA~NVt6~Nl}zQ`|9^>qt5ZM>wet>n<{A2t%VR$K4GJl}BqnJ4Yz z5u$I47=2faI(>g;{%+5lq18)P#eDYMjEu02YYzXRGeviB(aNP~H?AIv7+qggGVgv_ zy1=ty>9>z5w3mbBg}U>rk=TOJn+I1U_T;)b?AojCfQ}vBmS}OswE^k1zqG-cdVI^n z+~CY#Dxs3Jw{CG8Vp9LG{hXgJ7KX=#6ic-o$&+M+pNA#mHL!L(6yhEz*q8MJ&_0Qt@j005oU*OHR(#7-5l<1_)rGO7v3Wzzzf7GWqO2-PhEgU6 zsvk$=OCTRcX6%o3UJoM|+eTM@&rh^|a>FXF;MdEJNh}nY{nBPSwWJQCU&f(AJ?V|T zUcrM)JxX5AkS=C_x$|N)C7080x9*nu?`bboSK%o1#d9S!f7bO2TAYxaJxbdN3z<3h z)-aoaA3evfrtJK)ORU%=#x^Gvi}qh}l6UADV$&1DNql59bFP3H?eeAIAK zWJJ*)nz$^v@b;;tf4Tj6QC_}||ByNql}U6w_u~6>RAutsl*l55Yzb}cjxBey>odM_ zin!&N+nQE;a;2RsOxkn6^Ij@g9I@T%hbY4ALDC%ZErSGor;~cItN!RZHg8MYs5Lz> zaYyK)kBF+mqepOf660OhI2FII5G`36f7k`R=G%8yY`Axk{f?u9iQULvyW_=(SO3sB zZsjA%4dl0q%zUwRK=pK_p>*@jQ2jvj<3}r56sQ zV{b#u6;tWQ>{1RZdf98Q@7o$jcfA_9v1I?EG5X@SLS`#;?#GHjhF3yzyw@awbilV~ 
z?YVp#qs_ub#;sPj33P@B%P;3UmtQ))ijO|4ExI?Q}>-zIMf_ zPm(O`7fMoAUd!=CJ0ITjZAVh$8uqAvh>CM1Z=!akwP)L|vd#PfW$84OxGm=`b3uMl-W0~#9i?dZX5cIwmRM+n;i2Z&8mFvhM+ygB{;?ozv z0ch)TH9BkirK*0;x539QIr=`FpA__V<5sT}*GEC(V0h=7c;@Bl3S06b$@b5X`h91p4}G5c zvkVbcuj8;bZ2}B+Z-}q6i~o5#J@q2B(_`~~>1 z)}5&g`}T*~fTd@zhUFwT9`AgX7`g~wE4yI-XKi5Un+4SlpEu+U0u2$SNh{}&)Zm!N zVAVzV*Lzl1f+w{5?HZl@FPrg4d( zt**77kKgdB6JVL%t8x`3**nL9iu2RfHo9+(+G58KDd;Feann6Zm%c3eep6`4 z8^0j;r<7O4q~TT-D_q0f4t;> zcY10x<=JJpoKE16PnR_K@8}GtHp{~NS21r@lSP-lk`%PEjcExZsMVi~j^3Raa@4)) z3I*K`91mkSoGdJ`)N^($D?G0gUs=2;JZZ`z@AZasf7H@0 z=aF!6mm8}bG9%5l@w0i7w^r}FZuo`3D?*v*j^-gA!NZm&uojZi_3Qyo3%(xUiB<0x5DPy;G~KBff4z{@#JI3-Dj zk+O^kDP9g5dG(|oNQO|74O9_b18RsSY7?gi3FAZp2ET==XBg;uOdqR{SdXp&3wRPR zo1*}SA$xi~G)_^{V7fPMm@GsN^OYnam<}{UKujwa|x|dYX}O_myl5M%-XLPwR5`~dX$#Jx2WfsEMfE< zqo716z(P$Efwf3uMhs#YuO}*rDv-&PffINUn2lCj^w5-07&Hx(a`_|!)<~+i7+^Iq z5|9a88Cs4}p_F(nP7Sm(VJmpK|%X*9uu)=CAVTuaH2b6P6>xTm9O{_7DilZZ` zIeN4Pr{|$i3XT>y3+d+AXfMzpB`hWAgRFtHR4t(8i5Lt_0#1(<@>{qDav$7BuK_o+ zr3^hXfnorB1bU8yt(o&oJ=>c%CSXv;crru+t%<3i$cUmjt@`V<9ww^4N&O;=siF>U_}z46OePe0ee_d z9*j0wYN0Sp0>;RcvlQe$NP;p5B!V7LFt?kz6JdvOCVmYufu$kn>0_1yY$@2ztjFp( za=^!e0o4+F&;|+$?nBgIQTznx2}Q-%0t>9XQ6_;1r{>83z)}JB;Kl^PIrc~hJzSlj z2HixMyFo+?SBV_Ps;#_Pb5=ptU}|`JhKj3()vN)kj^mAyubYeKqp@f*z$X5OAlrHl z0W!$W#$&MH9PYK5tDxp8hhZv&1?CFGK_tY71kOqF@&7LqfT#bD;D6E~*GB+=MUZRU zIK(+{VIOj}_H(Gf*y6E7DhP5A8LQoFAuP_x2G3qsyhfZ^n7KMfAzojYS)3;>RTQsZ zyRLwW@)Wy^-Q6lMe*Zt^@V_n;v@3cEkOX22$q;+%R Date: Tue, 27 Jul 2021 07:48:50 -0500 Subject: [PATCH 3/6] add torch minversion="1.6.0" --- python/cudf/cudf/tests/test_cuda_array_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index ecf961f133b..bfddd682fe4 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -170,7 +170,7 @@ def 
test_column_from_ephemeral_cupy_try_lose_reference(): def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch") + torch = pytest.importorskip("torch", minversion="1.6.0") if not torch.cuda.is_available(): pytest.skip("need gpu version of pytorch to be installed") From d7ba34516a22c65052e157524502c5f5e386cc2b Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 27 Jul 2021 13:34:45 -0500 Subject: [PATCH 4/6] Java bindings for regex replace (#8847) This adds Java bindings for `cudf::strings::replace_re`. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8847 --- .../main/java/ai/rapids/cudf/ColumnView.java | 64 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 45 +++++++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 ++++++++++++ 3 files changed, 149 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 29d81325815..8b92b88a47c 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2465,6 +2465,48 @@ public final ColumnVector stringReplace(Scalar target, Scalar replace) { replace.getScalarHandle())); } + /** + * For each string, replaces any character sequence matching the given pattern using the + * replacement string scalar. + * + * @param pattern The regular expression pattern to search within each string. + * @param repl The string scalar to replace for each pattern match. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(String pattern, Scalar repl) { + return replaceRegex(pattern, repl, -1); + } + + /** + * For each string, replaces any character sequence matching the given pattern using the + * replacement string scalar. + * + * @param pattern The regular expression pattern to search within each string. 
+ * @param repl The string scalar to replace for each pattern match. + * @param maxRepl The maximum number of times a replacement should occur within each string. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(String pattern, Scalar repl, int maxRepl) { + if (!repl.getType().equals(DType.STRING)) { + throw new IllegalArgumentException("Replacement must be a string scalar"); + } + return new ColumnVector(replaceRegex(getNativeView(), pattern, repl.getScalarHandle(), + maxRepl)); + } + + /** + * For each string, replaces any character sequence matching any of the regular expression + * patterns with the corresponding replacement strings. + * + * @param patterns The regular expression patterns to search within each string. + * @param repls The string column containing the replacement for each corresponding pattern. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceMultiRegex(String[] patterns, ColumnView repls) { + return new ColumnVector(replaceMultiRegex(getNativeView(), patterns, + repls.getNativeView())); + } + /** * For each string, replaces any character sequence matching the given pattern * using the replace template for back-references. @@ -3241,6 +3283,28 @@ private static native long substringColumn(long columnView, long startColumn, lo */ private static native long stringReplace(long columnView, long target, long repl) throws CudfException; + /** + * Native method for replacing each regular expression pattern match with the specified + * replacement string. + * @param columnView native handle of the cudf::column_view being operated on. + * @param pattern The regular expression pattern to search within each string. + * @param repl native handle of the cudf::scalar containing the replacement string.
+ * @param maxRepl maximum number of times to replace the pattern within a string + * @return native handle of the resulting cudf column containing the string results. + */ + private static native long replaceRegex(long columnView, String pattern, + long repl, long maxRepl) throws CudfException; + + /** + * Native method for multiple instance regular expression replacement. + * @param columnView native handle of the cudf::column_view being operated on. + * @param patterns The regular expression patterns to search within each string. + * @param repls native handle of the cudf::column_view containing the replacement strings. + * @return native handle of the resulting cudf column containing the string results. + */ + private static native long replaceMultiRegex(long columnView, String[] patterns, + long repls) throws CudfException; + /** * Native method for replacing any character sequence matching the given pattern * using the replace template for back-references. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 07b65136970..b0ef773e166 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1213,6 +1213,51 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv *env, jclass, + jlong j_column_view, + jstring j_pattern, jlong j_repl, + jlong j_maxrepl) { + + JNI_NULL_CHECK(env, j_column_view, "column is null", 0); + JNI_NULL_CHECK(env, j_pattern, "pattern string is null", 0); + JNI_NULL_CHECK(env, j_repl, "replace scalar is null", 0); + try { + cudf::jni::auto_set_device(env); + auto cv = reinterpret_cast<cudf::column_view const *>(j_column_view); + cudf::strings_column_view scv(*cv); + cudf::jni::native_jstring pattern(env, j_pattern); + auto repl = reinterpret_cast<cudf::string_scalar const *>(j_repl); + + std::unique_ptr<cudf::column> result = + cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl); +
return reinterpret_cast<jlong>(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv *env, jclass, + jlong j_column_view, + jobjectArray j_patterns, + jlong j_repls) { + + JNI_NULL_CHECK(env, j_column_view, "column is null", 0); + JNI_NULL_CHECK(env, j_patterns, "patterns is null", 0); + JNI_NULL_CHECK(env, j_repls, "repls is null", 0); + try { + cudf::jni::auto_set_device(env); + auto cv = reinterpret_cast<cudf::column_view const *>(j_column_view); + cudf::strings_column_view scv(*cv); + cudf::jni::native_jstringArray patterns(env, j_patterns); + auto repl_cv = reinterpret_cast<cudf::column_view const *>(j_repls); + cudf::strings_column_view repl_scv(*repl_cv); + + std::unique_ptr<cudf::column> result = + cudf::strings::replace_re(scv, patterns.as_cpp_vector(), repl_scv); + return reinterpret_cast<jlong>(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs( JNIEnv *env, jclass, jlong column_view, jstring patternObj, jstring replaceObj) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 481ca8fbab0..d3fdb0e19bb 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4479,6 +4479,46 @@ void teststringReplaceThrowsException() { }); } + @Test + void testReplaceRegex() { + try (ColumnVector v = + ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + Scalar repl = Scalar.fromString("Repl"); + ColumnVector actual = v.replaceRegex("[tT]itle", repl); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector v = + ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + Scalar repl = Scalar.fromString("Repl"); + ColumnVector actual = v.replaceRegex("[tT]itle", repl, 0)) {
+ assertColumnsAreEqual(v, actual); + } + + try (ColumnVector v = + ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + Scalar repl = Scalar.fromString("Repl"); + ColumnVector actual = v.replaceRegex("[tT]itle", repl, 1); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + } + + @Test + void testReplaceMultiRegex() { + try (ColumnVector v = + ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + ColumnVector repls = ColumnVector.fromStrings("Repl", "**"); + ColumnVector actual = v.replaceMultiRegex(new String[] { "[tT]itle", "and|th" }, repls); + ColumnVector expected = + ColumnVector.fromStrings("Repl ** Repl wi** Repl", "no**ing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + } + @Test void testStringReplaceWithBackrefs() { From 279f9a42cd403b6d4d9df47fa8d4b3a580703289 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 27 Jul 2021 15:20:42 -0500 Subject: [PATCH 5/6] Update Java build instructions to mention Arrow S3 and Docker (#8867) Updates the Java build instructions to mention the `CUDF_ENABLE_ARROW_S3=OFF` setting when statically linking Arrow. The README now also points to the Java CI Docker environment and build script for building a Java jar that can run on any modern Linux system. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8867 --- java/README.md | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/java/README.md b/java/README.md index cf60d6a9c93..730eeadb10a 100644 --- a/java/README.md +++ b/java/README.md @@ -35,8 +35,8 @@ most modern cuda drivers. ``` In some cases there may be a classifier to indicate the version of cuda required. 
See the -Build From Source section below for more information about when this can happen. No official -release of the jar will have a classifier on it. +[Build From Source](#build-from-source) section below for more information about when this +can happen. No official release of the jar will have a classifier on it. CUDA 11.0: ```xml @@ -51,9 +51,9 @@ CUDA 11.0: ## Build From Source Build [libcudf](../cpp) first, and make sure the JDK is installed and available. Specify -the cmake option `-DCUDF_USE_ARROW_STATIC=ON` when building so that Apache Arrow is linked -statically to libcudf, as this will help create a jar that does not require Arrow and its -dependencies to be available in the runtime environment. +the cmake option `-DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF` when building so +that Apache Arrow is linked statically to libcudf, as this will help create a jar that +does not require Arrow and its dependencies to be available in the runtime environment. After building libcudf, the Java bindings can be built via Maven, e.g.: ``` @@ -63,6 +63,18 @@ mvn clean install If you have a compatible GPU on your build system the tests will use it. If not you will see a lot of skipped tests. +### Using the Java CI Docker Image + +If you are interested in building a Java cudf jar that is similar to the official releases +that can run on all modern Linux systems, see the [Java CI README](ci/README.md) for +instructions on how to build within a Docker environment using devtoolset. Note that +building the jar without the Docker setup and script will likely produce a jar that can +only run in environments similar to that of the build machine. + +If you decide to build without Docker and the build script, examining the cmake and Maven +settings in the [Java CI build script](ci/build-in-docker.sh) can be helpful if you are +encountering difficulties during the build. 
+ ## Dynamically Linking Arrow Since libcudf builds by default with a dynamically linked Arrow dependency, it may be From 904222bcd0bd9ed5c825e6be621c4b80189f1b60 Mon Sep 17 00:00:00 2001 From: pxLi Date: Wed, 28 Jul 2021 20:21:37 +0800 Subject: [PATCH 6/6] fix cufilejni build w/ c++17 (#8877) Signed-off-by: Peixin Li cufilejni was not specified CXX standard, so if `ENABLE_GDS=ON`, it will fail w/ ``` error: 'is_same_v' is not a member of 'std'; did you mean 'is_same' ``` Authors: - pxLi (https://github.com/pxLi) Approvers: - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/8877 --- java/src/main/native/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 1b0b747f92b..c018c0aa742 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -293,6 +293,14 @@ target_compile_definitions(cudfjni if(USE_GDS) add_library(cufilejni SHARED "src/CuFileJni.cpp") + SET_TARGET_PROPERTIES(cufilejni + PROPERTIES BUILD_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") endif(USE_GDS)