Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regex_program strings extract java APIs and tests #12699

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -3252,10 +3252,28 @@ public final ColumnVector containsRe(RegexProgram regexProg) {
* @throws CudfException if any error happens including if the RE does
* not contain any capture groups.
*/
@Deprecated
public final Table extractRe(String pattern) throws CudfException {
  // Legacy String entry point: wrap the raw pattern in a RegexProgram and
  // defer to the RegexProgram-based overload.
  final RegexProgram regexProg = new RegexProgram(pattern);
  return extractRe(regexProg);
}

/**
 * Extracts the capture groups of the given regex program from this string column.
 * For each captured group specified in the given regex program
 * return a column in the table. Null entries are added if the string
 * does not match. Any null inputs also result in null output entries.
 *
 * For supported regex patterns refer to:
 * <a href="https://docs.rapids.ai/api/libcudf/nightly/md_regex.html">https://docs.rapids.ai/api/libcudf/nightly/md_regex.html</a>
 * @param regexProg the regex program to use
 * @return the table of extracted matches
 * @throws CudfException if any error happens including if the regex
 * program does not contain any capture groups.
 */
public final Table extractRe(RegexProgram regexProg) throws CudfException {
assert type.equals(DType.STRING) : "column type must be a String";
// NOTE(review): the next two lines appear to be the pre-change body that this diff
// removes (the file header reports 4 deletions); they refer to the old `pattern`
// parameter and the old two-argument native call.
assert pattern != null : "pattern may not be null";
return new Table(extractRe(this.getNativeView(), pattern));
// Post-change body: the program's pattern, combined flags and capture setting are
// passed to the native method, and the result is wrapped in a Table.
assert regexProg != null : "regex program may not be null";
return new Table(extractRe(this.getNativeView(), regexProg.pattern(),
regexProg.combinedFlags(), regexProg.capture().nativeId));
}

/**
Expand Down Expand Up @@ -4100,9 +4118,14 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
private static native long stringContains(long cudfViewHandle, long compString) throws CudfException;

/**
 * Native method for extracting results from a regex program pattern. Returns a table handle.
 * (Pre-change wording, still shown by the diff view: "Native method for extracting results
 * from a regular expression. Returns a table handle.")
 *
 * @param cudfViewHandle Native handle of the cudf::column_view being operated on.
 * @param pattern String regex pattern.
 * @param flags Regex flags setting.
 * @param capture Capture groups setting.
 */
// NOTE(review): the two-argument declaration below appears to be the pre-change line
// that this diff removes; the four-argument declaration after it is its replacement.
private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException;
private static native long[] extractRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException;

/**
* Native method for extracting all results corresponding to group idx from a regex program pattern.
Expand Down
20 changes: 12 additions & 8 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1674,18 +1674,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env,

// JNI entry point backing the Java native method ColumnView.extractRe.
// Post-change behavior: views the column handle as a strings column, builds a
// cudf::strings::regex_program from the pattern, flags and capture-group setting,
// and returns the result of cudf::strings::extract via convert_table_for_return.
// NOTE(review): this hunk shows pre-change and post-change lines together (the file
// header reports 12 additions & 8 deletions); the pre-change lines are marked below.
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *env, jclass,
jlong j_view_handle,
// pre-change end of the parameter list (pattern only):
jstring patternObj) {
// post-change parameters: pattern plus integer-encoded regex flags and capture setting
jstring pattern_obj,
jint regex_flags,
jint capture_groups) {
JNI_NULL_CHECK(env, j_view_handle, "column is null", nullptr);
// pre-change null check on the old parameter name:
JNI_NULL_CHECK(env, patternObj, "pattern is null", nullptr);
JNI_NULL_CHECK(env, pattern_obj, "pattern is null", nullptr);

try {
cudf::jni::auto_set_device(env);
// pre-change body (through the first `return` statement): extract called directly
// with the pattern string.
cudf::strings_column_view const strings_column{
*reinterpret_cast<cudf::column_view *>(j_view_handle)};
cudf::jni::native_jstring pattern(env, patternObj);

return cudf::jni::convert_table_for_return(
env, cudf::strings::extract(strings_column, pattern.get()));
// post-change body: the jint arguments are cast to the cudf enum types and used to
// create a regex_program, which is then passed to extract.
auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
auto const strings_column = cudf::strings_column_view{*column_view};
auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
return cudf::jni::convert_table_for_return(env,
cudf::strings::extract(strings_column, *regex_prog));
}
// NOTE(review): presumably converts a caught C++ exception into a Java exception and
// returns 0 — confirm against the CATCH_STD macro definition.
CATCH_STD(env, 0);
}
Expand Down
18 changes: 11 additions & 7 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4040,14 +4040,18 @@ void testStringFindOperations() {

// Checks extractRe with the pattern ([ab])(\d) on the inputs "a1", "b2", "c3", null:
// the expected table has one column per capture group, with nulls for the
// non-matching "c3" row and for the null input row.
// NOTE(review): this hunk shows pre-change and post-change lines together (the file
// header reports 11 additions & 7 deletions); the pre-change lines are marked below.
@Test
void testExtractRe() {
// pre-change body (through the first assertTablesAreEqual call): a single
// try-with-resources that exercises only the String overload.
try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null);
Table expected = new Table.TestBuilder()
.column("a", "b", null, null)
.column("1", "2", null, null)
.build();
Table found = input.extractRe("([ab])(\\d)")) {
assertTablesAreEqual(expected, found);
// post-change body: input and expected are shared, and each overload is checked in
// its own nested try-with-resources so each result table is closed separately.
try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null);
Table expected = new Table.TestBuilder()
.column("a", "b", null, null)
.column("1", "2", null, null)
.build()) {
// String overload
try (Table found = input.extractRe("([ab])(\\d)")) {
assertTablesAreEqual(expected, found);
}
// RegexProgram overload
try (Table found = input.extractRe(new RegexProgram("([ab])(\\d)"))) {
assertTablesAreEqual(expected, found);
}
}
}

@Test
Expand Down