From c7db81a34ba4a2e8b10f5dc5ccc3652260124b52 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Mon, 6 Feb 2023 10:07:36 -0800 Subject: [PATCH] Add `regex_program` strings extract java APIs and tests (#12699) This PR adds [extract, extract_all_record](https://docs.rapids.ai/api/libcudf/nightly/strings_2extract_8hpp.html) related `regex_program` java APIs and unit tests. Part of work for https://github.com/NVIDIA/spark-rapids/issues/7295. Authors: - Cindy Jiang (https://github.com/cindyyuanjiang) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12699 --- .../main/java/ai/rapids/cudf/ColumnView.java | 31 ++++++++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 20 +++++++----- .../java/ai/rapids/cudf/ColumnVectorTest.java | 18 ++++++----- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index b3111cec77b..4daa3c17cfc 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3252,10 +3252,28 @@ public final ColumnVector containsRe(RegexProgram regexProg) { * @throws CudfException if any error happens including if the RE does * not contain any capture groups. */ + @Deprecated public final Table extractRe(String pattern) throws CudfException { + return extractRe(new RegexProgram(pattern)); + } + + /** + * For each captured group specified in the given regex program + * return a column in the table. Null entries are added if the string + * does not match. Any null inputs also result in null output entries. + * + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * @param regexProg the regex program to use + * @return the table of extracted matches + * @throws CudfException if any error happens including if the regex + * program does not contain any capture groups. + */ + public final Table extractRe(RegexProgram regexProg) throws CudfException { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - return new Table(extractRe(this.getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + return new Table(extractRe(this.getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -4100,9 +4118,14 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long stringContains(long cudfViewHandle, long compString) throws CudfException; /** - * Native method for extracting results from an regular expressions. Returns a table handle. + * Native method for extracting results from a regex program pattern. Returns a table handle. + * + * @param cudfViewHandle Native handle of the cudf::column_view being operated on. + * @param pattern String regex pattern. + * @param flags Regex flags setting. + * @param capture Capture groups setting. */ - private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long[] extractRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** * Native method for extracting all results corresponding to group idx from a regex program pattern. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index ff07a6786c1..bfa3fa0a522 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1674,18 +1674,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *env, jclass, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", nullptr); - JNI_NULL_CHECK(env, patternObj, "pattern is null", nullptr); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", nullptr); try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const strings_column{ - *reinterpret_cast(j_view_handle)}; - cudf::jni::native_jstring pattern(env, patternObj); - - return cudf::jni::convert_table_for_return( - env, cudf::strings::extract(strings_column, pattern.get())); + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + return cudf::jni::convert_table_for_return(env, + cudf::strings::extract(strings_column, *regex_prog)); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 26817281c2e..46264b7d668 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4040,14 +4040,18 @@ void testStringFindOperations() { @Test void testExtractRe() { - try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null); - Table expected = new Table.TestBuilder() - .column("a", "b", null, null) - .column("1", "2", null, null) - .build(); - Table found = input.extractRe("([ab])(\\d)")) { - assertTablesAreEqual(expected, found); + try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null); + Table expected = new Table.TestBuilder() + .column("a", "b", null, null) + .column("1", "2", null, null) + .build()) { + try (Table found = input.extractRe("([ab])(\\d)")) { + assertTablesAreEqual(expected, found); } + try (Table found = input.extractRe(new RegexProgram("([ab])(\\d)"))) { + assertTablesAreEqual(expected, found); + } + } } @Test