From 00c1f984f85495e1d2ec59916cf4ebfec7baf38b Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Tue, 31 Jan 2023 13:52:45 -0800 Subject: [PATCH 1/9] added regex program related java classes Signed-off-by: Cindy Jiang --- .../java/ai/rapids/cudf/CaptureGroups.java | 36 +++++ .../main/java/ai/rapids/cudf/RegexFlag.java | 37 +++++ .../java/ai/rapids/cudf/RegexProgram.java | 134 ++++++++++++++++++ 3 files changed, 207 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/CaptureGroups.java create mode 100644 java/src/main/java/ai/rapids/cudf/RegexFlag.java create mode 100644 java/src/main/java/ai/rapids/cudf/RegexProgram.java diff --git a/java/src/main/java/ai/rapids/cudf/CaptureGroups.java b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java new file mode 100644 index 00000000000..9dca14f85d1 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java @@ -0,0 +1,36 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Capture groups setting, closely following cudf::strings::capture_groups. + * + * For processing a regex pattern containing capture groups. These can be used + * to optimize the generated regex instructions where the capture groups do not + * require extracting the groups. 
+ */ +public enum CaptureGroups { + EXTRACT(0), // capture groups processed normally for extract + NON_CAPTURE(1); // convert all capture groups to non-capture groups + + final int nativeId; // Native id, for use with libcudf. + private CaptureGroups(int nativeId) { // Only constant values should be used + this.nativeId = nativeId; + } +} \ No newline at end of file diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java new file mode 100644 index 00000000000..20eafbc41fa --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -0,0 +1,37 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Regex flags setting, closely following cudf::strings::regex_flags. + * + * These types can be or'd to combine them. The values are chosen to + * leave room for future flags and to match the Python flag values. + */ +public enum RegexFlag { + DEFAULT(0), // default + MULTILINE(8), // the '^' and '$' honor new-line characters + DOTALL(16), // the '.' matching includes new-line characters + ASCII(256); // use only ASCII when matching built-in character classes + + final int nativeId; // Native id, for use with libcudf. 
+ private RegexFlag(int nativeId) { // Only constant values should be used + this.nativeId = nativeId; + } +} \ No newline at end of file diff --git a/java/src/main/java/ai/rapids/cudf/RegexProgram.java b/java/src/main/java/ai/rapids/cudf/RegexProgram.java new file mode 100644 index 00000000000..61b89f9ae02 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/RegexProgram.java @@ -0,0 +1,134 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.util.EnumSet; + +/** + * Regex program class, closely following cudf::strings::regex_program. 
+ */ +public class RegexProgram { + private String pattern; // regex pattern + // regex flags for interpreting special characters in the pattern + private EnumSet flags; + // controls how capture groups in the pattern are used + // default is to extract a capture group + private CaptureGroups capture; + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + */ + public RegexProgram(String pattern) { + this(pattern, EnumSet.of(RegexFlag.DEFAULT), CaptureGroups.EXTRACT); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + * @param flags Regex flags setting + */ + public RegexProgram(String pattern, EnumSet flags) { + this(pattern, flags, CaptureGroups.EXTRACT); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern setting + * @param capture Capture groups setting + */ + public RegexProgram(String pattern, CaptureGroups capture) { + this(pattern, EnumSet.of(RegexFlag.DEFAULT), capture); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + * @param flags Regex flags setting + * @param capture Capture groups setting + */ + public RegexProgram(String pattern, EnumSet flags, CaptureGroups capture) { + assert pattern != null : "pattern may not be null"; + this.pattern = pattern; + this.flags = flags; + this.capture = capture; + } + + /** + * Get the pattern used to create this instance + * + * @return A regex pattern as a string + */ + public String pattern() { + return pattern; + } + + /** + * Get the regex flags setting used to create this instance + * + * @return Regex flags setting + */ + public EnumSet flags() { + return flags; + } + + /** + * Reset the regex flags setting for this instance + * + * @param flags Regex flags setting + */ + public void setFlags(EnumSet flags) { + this.flags = flags; + } + + /** + * Get the capture groups setting used to create this instance + * + * @return Capture groups setting + */ + public CaptureGroups 
capture() { + return capture; + } + + /** + * Reset the capture groups setting for this instance + * + * @param capture Capture groups setting + */ + public void setCapture(CaptureGroups capture) { + this.capture = capture; + } + + /** + * Combine the regex flags using 'or' + * + * @return An integer representing the value of combined (or'ed) flags + */ + public int combinedFlags() { + int allFlags = 0; + for (RegexFlag flag : flags) { + allFlags |= flag.nativeId; + } + return allFlags; + } +} \ No newline at end of file From c2869e6709e3f0fe1a998671dbf89ea9817d1dc2 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Wed, 1 Feb 2023 13:57:56 -0800 Subject: [PATCH 2/9] fixed code format Signed-off-by: Cindy Jiang --- java/src/main/java/ai/rapids/cudf/CaptureGroups.java | 2 +- java/src/main/java/ai/rapids/cudf/RegexFlag.java | 2 +- java/src/main/java/ai/rapids/cudf/RegexProgram.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CaptureGroups.java b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java index 9dca14f85d1..2ab778dbc35 100644 --- a/java/src/main/java/ai/rapids/cudf/CaptureGroups.java +++ b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java @@ -33,4 +33,4 @@ public enum CaptureGroups { private CaptureGroups(int nativeId) { // Only constant values should be used this.nativeId = nativeId; } -} \ No newline at end of file +} diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 20eafbc41fa..7ed8e0354c9 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -34,4 +34,4 @@ public enum RegexFlag { private RegexFlag(int nativeId) { // Only constant values should be used this.nativeId = nativeId; } -} \ No newline at end of file +} diff --git a/java/src/main/java/ai/rapids/cudf/RegexProgram.java b/java/src/main/java/ai/rapids/cudf/RegexProgram.java index 
61b89f9ae02..191a0b95ff3 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexProgram.java +++ b/java/src/main/java/ai/rapids/cudf/RegexProgram.java @@ -131,4 +131,4 @@ public int combinedFlags() { } return allFlags; } -} \ No newline at end of file +} From 7f46b3f208d575c5ab6b72e7191b68b0d3290584 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Thu, 2 Feb 2023 11:10:20 -0800 Subject: [PATCH 3/9] updated containsRe API Signed-off-by: Cindy Jiang --- .../main/java/ai/rapids/cudf/ColumnView.java | 35 ++++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 17 ++++--- .../java/ai/rapids/cudf/ColumnVectorTest.java | 44 +++++++++++++------ 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 47d6b7573cd..f5c6966c4f5 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3177,7 +3177,7 @@ public final ColumnVector matchesRe(String pattern) { * * ``` * cv = ["abc","123","def456"] - * result = cv.matches_re("\\d+") + * result = cv.contains_re("\\d+") * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. @@ -3187,11 +3187,32 @@ public final ColumnVector matchesRe(String pattern) { * @param pattern Regex pattern to match to each string. * @return New ColumnVector of boolean results for each string. */ + @Deprecated public final ColumnVector containsRe(String pattern) { + return containsRe(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE)); + } + + /** + * Returns a boolean ColumnVector identifying rows which + * match the given RegexProgram pattern starting at any location. + * + * ``` + * cv = ["abc","123","def456"] + * result = cv.contains_re("\\d+") + * r is now [false, true, true] + * ``` + * Any null string entries return corresponding null output column entries. 
+ * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * + * @param regexProg Regex program to match to each string. + * @return New ColumnVector of boolean results for each string. + */ + public final ColumnVector containsRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - assert !pattern.isEmpty() : "pattern string may not be empty"; - return new ColumnVector(containsRe(getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; + return new ColumnVector(containsRe(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -4004,12 +4025,14 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long matchesRe(long cudfViewHandle, String pattern) throws CudfException; /** - * Native method for checking if strings match the passed in regex pattern starting at any location. + * Native method for checking if strings match the passed in regex program pattern starting at any location. * @param cudfViewHandle native handle of the cudf::column_view being operated on. * @param pattern string regex pattern. + * @param flags regex flags setting. + * @param capture capture groups setting. * @return native handle of the resulting cudf column containing the boolean results. 
*/ - private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long containsRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** * Native method for checking if strings match the passed in like pattern diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index b48ddae196b..4775c98d0e2 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1306,16 +1306,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv *env, jo JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, jobject j_object, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", false); - JNI_NULL_CHECK(env, patternObj, "pattern is null", false); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); - cudf::jni::native_jstring pattern(env, patternObj); - return release_as_jlong(cudf::strings::contains_re(strings_column, pattern.get())); + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const capture = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, capture); + return release_as_jlong(cudf::strings::contains_re(strings_column, *regex_prog)); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 
fc0a542e0a7..61a63713e52 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4115,36 +4115,54 @@ void testContainsRe() { String patternString2 = "[A-Za-z]+\\s@[A-Za-z]+"; String patternString3 = ".*"; String patternString4 = ""; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg2 = new RegexProgram(patternString2, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg3 = new RegexProgram(patternString3, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg4 = new RegexProgram(patternString4, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings(null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); - ColumnVector res1 = testStrings.containsRe(patternString1); - ColumnVector res2 = testStrings.containsRe(patternString2); - ColumnVector res3 = testStrings.containsRe(patternString3); + "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(null, false, false, false, true, true, true, true); ColumnVector expected2 = ColumnVector.fromBoxedBooleans(null, false, false, true, false, false, false, true); ColumnVector expected3 = ColumnVector.fromBoxedBooleans(null, true, true, true, true, true, true, true)) { - assertColumnsAreEqual(expected1, res1); - assertColumnsAreEqual(expected2, res2); - assertColumnsAreEqual(expected3, res3); + try (ColumnVector res1 = testStrings.containsRe(patternString1); + ColumnVector res2 = testStrings.containsRe(patternString2); + ColumnVector res3 = testStrings.containsRe(patternString3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + try (ColumnVector res1 = testStrings.containsRe(regexProg1); + ColumnVector res2 = testStrings.containsRe(regexProg2); + ColumnVector 
res3 = testStrings.containsRe(regexProg3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + } + try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", + "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs")) { + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.containsRe(patternString4)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.containsRe(regexProg4)) {} + }); } - assertThrows(AssertionError.class, () -> { - try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); - ColumnVector res = testStrings.containsRe(patternString4)) {} - }); } @Test - @Disabled("Needs fix for https://github.com/rapidsai/cudf/issues/4671") void testContainsReEmptyInput() { String patternString1 = ".*"; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings(""); ColumnVector res1 = testStrings.containsRe(patternString1); + ColumnVector resReProg1 = testStrings.containsRe(regexProg1); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(true)) { assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected1, resReProg1); } } From b077d6a9262a9226b4b1e2cc2a908b8eb41095c0 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Thu, 2 Feb 2023 11:43:22 -0800 Subject: [PATCH 4/9] updated matchesRe API and tests Signed-off-by: Cindy Jiang --- .../main/java/ai/rapids/cudf/ColumnView.java | 33 +++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 18 +++++--- .../java/ai/rapids/cudf/ColumnVectorTest.java | 43 ++++++++++++------- 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java 
b/java/src/main/java/ai/rapids/cudf/ColumnView.java index f5c6966c4f5..4ee7f3d0de5 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3164,11 +3164,32 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h * @param pattern Regex pattern to match to each string. * @return New ColumnVector of boolean results for each string. */ + @Deprecated public final ColumnVector matchesRe(String pattern) { + return matchesRe(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE)); + } + + /** + * Returns a boolean ColumnVector identifying rows which + * match the given regex program pattern but only at the beginning of the string. + * + * ``` + * cv = ["abc","123","def456"] + * result = cv.matches_re("\\d+") + * r is now [false, true, false] + * ``` + * Any null string entries return corresponding null output column entries. + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * + * @param regexProg Regex program to match to each string. + * @return New ColumnVector of boolean results for each string. 
+ */ + public final ColumnVector matchesRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - assert !pattern.isEmpty() : "pattern string may not be empty"; - return new ColumnVector(matchesRe(getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; + return new ColumnVector(matchesRe(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -4016,13 +4037,15 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long stringStrip(long columnView, int type, long toStrip) throws CudfException; /** - * Native method for checking if strings match the passed in regex pattern from the + * Native method for checking if strings match the passed in regex program pattern from the * beginning of the string. * @param cudfViewHandle native handle of the cudf::column_view being operated on. * @param pattern string regex pattern. + * @param flags regex flags setting. + * @param capture capture groups setting. * @return native handle of the resulting cudf column containing the boolean results. */ - private static native long matchesRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long matchesRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** * Native method for checking if strings match the passed in regex program pattern starting at any location. 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4775c98d0e2..3ed1f3ed42e 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1290,16 +1291,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringContains(JNIEnv *en JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv *env, jobject j_object, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", false); - JNI_NULL_CHECK(env, patternObj, "pattern is null", false); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); - cudf::jni::native_jstring pattern(env, patternObj); - return release_as_jlong(cudf::strings::matches_re(strings_column, pattern.get())); + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + return release_as_jlong(cudf::strings::matches_re(strings_column, *regex_prog)); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 61a63713e52..fd6218a6ddd 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4087,26 +4087,39 @@ void testMatchesRe() { String patternString2 = 
"[A-Za-z]+\\s@[A-Za-z]+"; String patternString3 = ".*"; String patternString4 = ""; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg2 = new RegexProgram(patternString2, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg3 = new RegexProgram(patternString3, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg4 = new RegexProgram(patternString4, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00"); - ColumnVector res1 = testStrings.matchesRe(patternString1); - ColumnVector res2 = testStrings.matchesRe(patternString2); - ColumnVector res3 = testStrings.matchesRe(patternString3); + "lazy @dog", "1234", "00:0:00"); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(false, null, false, false, false, - true, true); + true, true); ColumnVector expected2 = ColumnVector.fromBoxedBooleans(false, null, false, false, true, - false, false); + false, false); ColumnVector expected3 = ColumnVector.fromBoxedBooleans(true, null, true, true, true, - true, true)) { - assertColumnsAreEqual(expected1, res1); - assertColumnsAreEqual(expected2, res2); - assertColumnsAreEqual(expected3, res3); + true, true)) { + try (ColumnVector res1 = testStrings.matchesRe(patternString1); + ColumnVector res2 = testStrings.matchesRe(patternString2); + ColumnVector res3 = testStrings.matchesRe(patternString3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + try (ColumnVector res1 = testStrings.matchesRe(regexProg1); + ColumnVector res2 = testStrings.matchesRe(regexProg2); + ColumnVector res3 = testStrings.matchesRe(regexProg3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = 
testStrings.matchesRe(patternString4)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.matchesRe(regexProg4)) {} + }); } - assertThrows(AssertionError.class, () -> { - try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00"); - ColumnVector res = testStrings.matchesRe(patternString4)) {} - }); } @Test From 326c9a35f5a011d330a276de4a709a80b5929b5e Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Thu, 2 Feb 2023 12:08:50 -0800 Subject: [PATCH 5/9] updated extractAllRecord API and tests Signed-off-by: Cindy Jiang --- .../main/java/ai/rapids/cudf/ColumnView.java | 36 ++++++++++-- java/src/main/native/src/ColumnViewJni.cpp | 23 ++++---- .../java/ai/rapids/cudf/ColumnVectorTest.java | 55 +++++++++++-------- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 4ee7f3d0de5..89362d8bf5a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3189,7 +3189,8 @@ public final ColumnVector matchesRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; assert regexProg != null : "regex program may not be null"; assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; - return new ColumnVector(matchesRe(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), regexProg.capture().nativeId)); + return new ColumnVector(matchesRe(getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -3233,7 +3234,8 @@ public final ColumnVector containsRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; assert regexProg != null : "regex program may not be null"; assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; - return new 
ColumnVector(containsRe(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), regexProg.capture().nativeId)); + return new ColumnVector(containsRe(getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -3264,11 +3266,31 @@ public final Table extractRe(String pattern) throws CudfException { * @param idx The regex group index * @return A new column vector of extracted matches */ + @Deprecated public final ColumnVector extractAllRecord(String pattern, int idx) { + if (idx == 0) { + return extractAllRecord(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), idx); + } + return extractAllRecord(new RegexProgram(pattern), idx); + } + + /** + * Extracts all strings that match the given regex program pattern and corresponds to the + * regular expression group index. Any null inputs also result in null output entries. + * + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * @param regexProg The regex program + * @param idx The regex group index + * @return A new column vector of extracted matches + */ + public final ColumnVector extractAllRecord(RegexProgram regexProg, int idx) { assert type.equals(DType.STRING) : "column type must be a String"; assert idx >= 0 : "group index must be at least 0"; - - return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx)); + assert regexProg != null : "regex program may not be null"; + return new ColumnVector( + extractAllRecord(this.getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, idx)); } /** @@ -4081,14 +4103,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException; /** - * Native method for extracting all results corresponding to group idx from a regular expression. 
+ * Native method for extracting all results corresponding to group idx from a regex program pattern. * * @param nativeHandle Native handle of the cudf::column_view being operated on. * @param pattern String regex pattern. + * @param flags Regex flags setting. + * @param capture Capture groups setting. * @param idx Regex group index. A 0 value means matching the entire regex. * @return Native handle of a string column of the result. */ - private static native long extractAllRecord(long nativeHandle, String pattern, int idx); + private static native long extractAllRecord(long nativeHandle, String pattern, int flags, int capture, int idx); private static native long urlDecode(long cudfViewHandle); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 3ed1f3ed42e..e364794d3bc 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1690,21 +1690,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *en CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord(JNIEnv *env, jclass, - jlong j_view_handle, - jstring pattern_obj, - jint idx) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord( + JNIEnv *env, jclass, jlong j_view_handle, jstring pattern_obj, jint regex_flags, + jint capture_groups, jint idx) { JNI_NULL_CHECK(env, j_view_handle, "column is null", 0); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", 0); try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const strings_column{ - *reinterpret_cast(j_view_handle)}; - cudf::jni::native_jstring pattern(env, pattern_obj); - - auto result = (idx == 0) ? 
cudf::strings::findall(strings_column, pattern.get()) : - cudf::strings::extract_all_record(strings_column, pattern.get()); - + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + auto result = (idx == 0) ? cudf::strings::findall(strings_column, *regex_prog) : + cudf::strings::extract_all_record(strings_column, *regex_prog); return release_as_jlong(result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fd6218a6ddd..26817281c2e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4053,31 +4053,38 @@ void testExtractRe() { @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; + RegexProgram regexProg = new RegexProgram(pattern); try (ColumnVector v = ColumnVector.fromStrings("a1", "b2", "c3", null, "a1b1c3a2"); - ColumnVector expectedIdx0 = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("a1"), - Arrays.asList("b2"), - Arrays.asList(), - null, - Arrays.asList("a1", "b1", "a2")); - ColumnVector expectedIdx12 = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("a", "1"), - Arrays.asList("b", "2"), - null, - null, - Arrays.asList("a", "1", "b", "1", "a", "2")); - - ColumnVector resultIdx0 = v.extractAllRecord(pattern, 0); - ColumnVector resultIdx1 = v.extractAllRecord(pattern, 1); - ColumnVector resultIdx2 = v.extractAllRecord(pattern, 2); - ) { - 
assertColumnsAreEqual(expectedIdx0, resultIdx0); - assertColumnsAreEqual(expectedIdx12, resultIdx1); - assertColumnsAreEqual(expectedIdx12, resultIdx2); + ColumnVector expectedIdx0 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("a1"), + Arrays.asList("b2"), + Arrays.asList(), + null, + Arrays.asList("a1", "b1", "a2")); + ColumnVector expectedIdx12 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("a", "1"), + Arrays.asList("b", "2"), + null, + null, + Arrays.asList("a", "1", "b", "1", "a", "2"))) { + try (ColumnVector resultIdx0 = v.extractAllRecord(pattern, 0); + ColumnVector resultIdx1 = v.extractAllRecord(pattern, 1); + ColumnVector resultIdx2 = v.extractAllRecord(pattern, 2)) { + assertColumnsAreEqual(expectedIdx0, resultIdx0); + assertColumnsAreEqual(expectedIdx12, resultIdx1); + assertColumnsAreEqual(expectedIdx12, resultIdx2); + } + try (ColumnVector resultIdx0 = v.extractAllRecord(regexProg, 0); + ColumnVector resultIdx1 = v.extractAllRecord(regexProg, 1); + ColumnVector resultIdx2 = v.extractAllRecord(regexProg, 2)) { + assertColumnsAreEqual(expectedIdx0, resultIdx0); + assertColumnsAreEqual(expectedIdx12, resultIdx1); + assertColumnsAreEqual(expectedIdx12, resultIdx2); + } } } From 4ab46e23937e063fe088534e319c8f31273ea083 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Thu, 2 Feb 2023 14:53:24 -0800 Subject: [PATCH 6/9] fixed code formatting Signed-off-by: Cindy Jiang --- java/src/main/native/src/ColumnViewJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index e364794d3bc..ff07a6786c1 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -62,10 +62,10 @@ #include #include #include +#include #include #include #include 
-#include #include #include #include From bd6691df8e84e9612a1687fde02296d6eb88209a Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Fri, 3 Feb 2023 10:29:10 -0800 Subject: [PATCH 7/9] updated function documentation Signed-off-by: Cindy Jiang --- .../src/main/java/ai/rapids/cudf/ColumnView.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 89362d8bf5a..7a90275b17f 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3153,7 +3153,7 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h * match the given regex pattern but only at the beginning of the string. * * ``` - * cv = ["abc","123","def456"] + * cv = ["abc", "123", "def456"] * result = cv.matches_re("\\d+") * r is now [false, true, false] * ``` @@ -3174,8 +3174,9 @@ public final ColumnVector matchesRe(String pattern) { * match the given regex program pattern but only at the beginning of the string. * * ``` - * cv = ["abc","123","def456"] - * result = cv.matches_re("\\d+") + * cv = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = cv.matches_re(p) * r is now [false, true, false] * ``` * Any null string entries return corresponding null output column entries. @@ -3198,8 +3199,8 @@ public final ColumnVector matchesRe(RegexProgram regexProg) { * match the given regex pattern starting at any location. * * ``` - * cv = ["abc","123","def456"] - * result = cv.contains_re("\\d+") + * cv = ["abc", "123", "def456"] + * r = cv.contains_re("\\d+") * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. @@ -3219,8 +3220,9 @@ public final ColumnVector containsRe(String pattern) { * match the given RegexProgram pattern starting at any location. 
* * ``` - * cv = ["abc","123","def456"] - * result = cv.contains_re("\\d+") + * cv = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = cv.contains_re(p) * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. From 1d74ec170fc40f03cf4477b0e32516e0cb3cdb86 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Fri, 3 Feb 2023 10:46:00 -0800 Subject: [PATCH 8/9] fixed function documentation Signed-off-by: Cindy Jiang --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 7a90275b17f..9981ce4535b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3154,7 +3154,7 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h * * ``` * cv = ["abc", "123", "def456"] - * result = cv.matches_re("\\d+") + * r = cv.matchesRe("\\d+") * r is now [false, true, false] * ``` * Any null string entries return corresponding null output column entries. @@ -3175,8 +3175,8 @@ public final ColumnVector matchesRe(String pattern) { * * ``` * cv = ["abc", "123", "def456"] - * p = regex_program::create("\\d+") - * r = cv.matches_re(p) + * p = RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * r = cv.matchesRe(p) * r is now [false, true, false] * ``` * Any null string entries return corresponding null output column entries. @@ -3200,7 +3200,7 @@ public final ColumnVector matchesRe(RegexProgram regexProg) { * * ``` * cv = ["abc", "123", "def456"] - * r = cv.contains_re("\\d+") + * r = cv.containsRe("\\d+") * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. 
@@ -3221,8 +3221,8 @@ public final ColumnVector containsRe(String pattern) { * * ``` * cv = ["abc", "123", "def456"] - * p = regex_program::create("\\d+") - * r = cv.contains_re(p) + * p = RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * r = cv.containsRe(p) * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. From c13ccb64a56c68615fd481d04a666546a7ecda47 Mon Sep 17 00:00:00 2001 From: Cindy Jiang Date: Fri, 3 Feb 2023 10:50:37 -0800 Subject: [PATCH 9/9] updated function documentation Signed-off-by: Cindy Jiang --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 9981ce4535b..b3111cec77b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3175,7 +3175,7 @@ public final ColumnVector matchesRe(String pattern) { * * ``` * cv = ["abc", "123", "def456"] - * p = RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) * r = cv.matchesRe(p) * r is now [false, true, false] * ``` @@ -3221,7 +3221,7 @@ public final ColumnVector containsRe(String pattern) { * * ``` * cv = ["abc", "123", "def456"] - * p = RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) * r = cv.containsRe(p) * r is now [false, true, true] * ```