Skip to content

Commit

Permalink
Add regex_program searching APIs and related java classes (#12666)
Browse files Browse the repository at this point in the history
  • Loading branch information
cindyyuanjiang authored Feb 3, 2023
1 parent e380331 commit 17554ad
Show file tree
Hide file tree
Showing 6 changed files with 422 additions and 93 deletions.
36 changes: 36 additions & 0 deletions java/src/main/java/ai/rapids/cudf/CaptureGroups.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

/**
 * Capture groups setting, closely following cudf::strings::capture_groups.
 *
 * Used when processing a regex pattern that contains capture groups. Choosing
 * the right setting lets the generated regex instructions be optimized for
 * callers that do not need the groups extracted.
 */
public enum CaptureGroups {
  /** Capture groups processed normally for extract. */
  EXTRACT(0),
  /** Convert all capture groups to non-capture groups. */
  NON_CAPTURE(1);

  /** Native id, for use with libcudf. */
  final int nativeId;

  // Enum constructors are implicitly private, so only the constants above can supply an id.
  CaptureGroups(int id) {
    nativeId = id;
  }
}
108 changes: 90 additions & 18 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -3153,8 +3153,8 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h
* match the given regex pattern but only at the beginning of the string.
*
* ```
* cv = ["abc","123","def456"]
* result = cv.matches_re("\\d+")
* cv = ["abc", "123", "def456"]
* result = cv.matchesRe("\\d+")
* r is now [false, true, false]
* ```
* Any null string entries return corresponding null output column entries.
Expand All @@ -3164,20 +3164,43 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h
* @param pattern Regex pattern to match to each string.
* @return New ColumnVector of boolean results for each string.
*/
@Deprecated
public final ColumnVector matchesRe(String pattern) {
  // Wrap the raw pattern in a program with capture groups converted to non-capture groups,
  // then defer to the RegexProgram overload.
  RegexProgram program = new RegexProgram(pattern, CaptureGroups.NON_CAPTURE);
  return matchesRe(program);
}

/**
* Returns a boolean ColumnVector identifying rows which
* match the given regex program pattern but only at the beginning of the string.
*
* ```
* cv = ["abc", "123", "def456"]
* p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE)
* r = cv.matchesRe(p)
* r is now [false, true, false]
* ```
* Any null string entries return corresponding null output column entries.
* For supported regex patterns refer to:
* @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html
*
* @param regexProg Regex program to match to each string.
* @return New ColumnVector of boolean results for each string.
*/
public final ColumnVector matchesRe(RegexProgram regexProg) {
// Precondition checks use `assert`, so they only run when assertions are enabled.
assert type.equals(DType.STRING) : "column type must be a String";
// NOTE(review): the next three lines reference `pattern`, which is not a parameter of this
// overload; they look like the removed side of the diff shown inline -- confirm.
assert pattern != null : "pattern may not be null";
assert !pattern.isEmpty() : "pattern string may not be empty";
return new ColumnVector(matchesRe(getNativeView(), pattern));
assert regexProg != null : "regex program may not be null";
assert !regexProg.pattern().isEmpty() : "pattern string may not be empty";
// The native call receives the pattern text, the OR-combined flag bits, and the
// capture-groups native id; the returned handle is wrapped in a new ColumnVector.
return new ColumnVector(matchesRe(getNativeView(), regexProg.pattern(),
regexProg.combinedFlags(), regexProg.capture().nativeId));
}

/**
* Returns a boolean ColumnVector identifying rows which
* match the given regex pattern starting at any location.
*
* ```
* cv = ["abc","123","def456"]
* result = cv.matches_re("\\d+")
* cv = ["abc", "123", "def456"]
* r = cv.containsRe("\\d+")
* r is now [false, true, true]
* ```
* Any null string entries return corresponding null output column entries.
Expand All @@ -3187,11 +3210,34 @@ public final ColumnVector matchesRe(String pattern) {
* @param pattern Regex pattern to match to each string.
* @return New ColumnVector of boolean results for each string.
*/
@Deprecated
public final ColumnVector containsRe(String pattern) {
  // Wrap the raw pattern in a program with capture groups converted to non-capture groups,
  // then defer to the RegexProgram overload.
  RegexProgram program = new RegexProgram(pattern, CaptureGroups.NON_CAPTURE);
  return containsRe(program);
}

/**
* Returns a boolean ColumnVector identifying rows which
* match the given RegexProgram pattern starting at any location.
*
* ```
* cv = ["abc", "123", "def456"]
* p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE)
* r = cv.containsRe(p)
* r is now [false, true, true]
* ```
* Any null string entries return corresponding null output column entries.
* For supported regex patterns refer to:
* @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html
*
* @param regexProg Regex program to match to each string.
* @return New ColumnVector of boolean results for each string.
*/
public final ColumnVector containsRe(RegexProgram regexProg) {
// Precondition checks use `assert`, so they only run when assertions are enabled.
assert type.equals(DType.STRING) : "column type must be a String";
// NOTE(review): the next three lines reference `pattern`, which is not a parameter of this
// overload; they look like the removed side of the diff shown inline -- confirm.
assert pattern != null : "pattern may not be null";
assert !pattern.isEmpty() : "pattern string may not be empty";
return new ColumnVector(containsRe(getNativeView(), pattern));
assert regexProg != null : "regex program may not be null";
assert !regexProg.pattern().isEmpty() : "pattern string may not be empty";
// The native call receives the pattern text, the OR-combined flag bits, and the
// capture-groups native id; the returned handle is wrapped in a new ColumnVector.
return new ColumnVector(containsRe(getNativeView(), regexProg.pattern(),
regexProg.combinedFlags(), regexProg.capture().nativeId));
}

/**
Expand Down Expand Up @@ -3222,11 +3268,31 @@ public final Table extractRe(String pattern) throws CudfException {
* @param idx The regex group index
* @return A new column vector of extracted matches
*/
@Deprecated
public final ColumnVector extractAllRecord(String pattern, int idx) {
  // Index 0 is built with capture groups converted to non-capture groups;
  // any other index uses the single-argument constructor (which extracts groups).
  RegexProgram program = (idx == 0)
      ? new RegexProgram(pattern, CaptureGroups.NON_CAPTURE)
      : new RegexProgram(pattern);
  return extractAllRecord(program, idx);
}

/**
* Extracts all strings that match the given regex program pattern and correspond to the
* regular expression group index. Any null inputs also result in null output entries.
*
* For supported regex patterns refer to:
* @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html
* @param regexProg The regex program
* @param idx The regex group index
* @return A new column vector of extracted matches
*/
public final ColumnVector extractAllRecord(RegexProgram regexProg, int idx) {
// Precondition checks use `assert`, so they only run when assertions are enabled.
assert type.equals(DType.STRING) : "column type must be a String";
assert idx >= 0 : "group index must be at least 0";

// NOTE(review): the next line references `pattern`, which is not a parameter of this
// overload; it looks like the removed side of the diff shown inline -- confirm.
return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx));
assert regexProg != null : "regex program may not be null";
// The native call receives the pattern text, the OR-combined flag bits, the
// capture-groups native id, and the group index; the handle is wrapped in a new ColumnVector.
return new ColumnVector(
extractAllRecord(this.getNativeView(), regexProg.pattern(), regexProg.combinedFlags(),
regexProg.capture().nativeId, idx));
}

/**
Expand Down Expand Up @@ -3995,21 +4061,25 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
private static native long stringStrip(long columnView, int type, long toStrip) throws CudfException;

/**
* Native method for checking if strings match the passed in regex pattern from the
* Native method for checking if strings match the passed in regex program pattern from the
* beginning of the string.
* @param cudfViewHandle native handle of the cudf::column_view being operated on.
* @param pattern string regex pattern.
* @param flags regex flags setting.
* @param capture capture groups setting.
* @return native handle of the resulting cudf column containing the boolean results.
*/
// NOTE(review): two signatures are shown; the two-argument form appears to be the pre-change
// declaration from the diff, and the public matchesRe(RegexProgram) passes flags and capture
// to the four-argument form -- confirm which declaration remains in the file.
private static native long matchesRe(long cudfViewHandle, String pattern) throws CudfException;
private static native long matchesRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException;

/**
* Native method for checking if strings match the passed in regex pattern starting at any location.
* Native method for checking if strings match the passed in regex program pattern starting at any location.
* @param cudfViewHandle native handle of the cudf::column_view being operated on.
* @param pattern string regex pattern.
* @param flags regex flags setting.
* @param capture capture groups setting.
* @return native handle of the resulting cudf column containing the boolean results.
*/
// NOTE(review): two signatures are shown; the two-argument form appears to be the pre-change
// declaration from the diff, and the public containsRe(RegexProgram) passes flags and capture
// to the four-argument form -- confirm which declaration remains in the file.
private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException;
private static native long containsRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException;

/**
* Native method for checking if strings match the passed in like pattern
Expand All @@ -4035,14 +4105,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException;

/**
* Native method for extracting all results corresponding to group idx from a regular expression.
* Native method for extracting all results corresponding to group idx from a regex program pattern.
*
* @param nativeHandle Native handle of the cudf::column_view being operated on.
* @param pattern String regex pattern.
* @param flags Regex flags setting.
* @param capture Capture groups setting.
* @param idx Regex group index. A 0 value means matching the entire regex.
* @return Native handle of a string column of the result.
*/
// NOTE(review): two signatures are shown; the three-argument form appears to be the pre-change
// declaration from the diff, and the public extractAllRecord(RegexProgram, int) passes flags
// and capture to the five-argument form -- confirm which declaration remains in the file.
private static native long extractAllRecord(long nativeHandle, String pattern, int idx);
private static native long extractAllRecord(long nativeHandle, String pattern, int flags, int capture, int idx);

private static native long urlDecode(long cudfViewHandle);

Expand Down
37 changes: 37 additions & 0 deletions java/src/main/java/ai/rapids/cudf/RegexFlag.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

/**
 * Regex flags setting, closely following cudf::strings::regex_flags.
 *
 * Individual flags can be or'd together to combine them. The numeric values
 * are chosen to leave room for future flags and to match the Python flag values.
 */
public enum RegexFlag {
  /** Default. */
  DEFAULT(0),
  /** The '^' and '$' honor new-line characters. */
  MULTILINE(8),
  /** The '.' matching includes new-line characters. */
  DOTALL(16),
  /** Use only ASCII when matching built-in character classes. */
  ASCII(256);

  /** Native id, for use with libcudf. */
  final int nativeId;

  // Enum constructors are implicitly private, so only the constants above can supply an id.
  RegexFlag(int id) {
    nativeId = id;
  }
}
134 changes: 134 additions & 0 deletions java/src/main/java/ai/rapids/cudf/RegexProgram.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

import java.util.EnumSet;

/**
 * Regex program class, closely following cudf::strings::regex_program.
 *
 * Bundles a regex pattern with the flags used to interpret it and the
 * capture groups setting. The pattern is fixed at construction; the flags
 * and capture groups setting can be reset afterwards.
 */
public class RegexProgram {
  private final String pattern; // regex pattern, never reassigned after construction
  // regex flags for interpreting special characters in the pattern
  private EnumSet<RegexFlag> flags;
  // controls how capture groups in the pattern are used
  // default is to extract a capture group
  private CaptureGroups capture;

  /**
   * Constructor for RegexProgram using the default flag and extracting capture groups
   *
   * @param pattern Regex pattern
   */
  public RegexProgram(String pattern) {
    this(pattern, EnumSet.of(RegexFlag.DEFAULT), CaptureGroups.EXTRACT);
  }

  /**
   * Constructor for RegexProgram extracting capture groups
   *
   * @param pattern Regex pattern
   * @param flags Regex flags setting
   */
  public RegexProgram(String pattern, EnumSet<RegexFlag> flags) {
    this(pattern, flags, CaptureGroups.EXTRACT);
  }

  /**
   * Constructor for RegexProgram using the default flag
   *
   * @param pattern Regex pattern
   * @param capture Capture groups setting
   */
  public RegexProgram(String pattern, CaptureGroups capture) {
    this(pattern, EnumSet.of(RegexFlag.DEFAULT), capture);
  }

  /**
   * Constructor for RegexProgram
   *
   * @param pattern Regex pattern
   * @param flags Regex flags setting
   * @param capture Capture groups setting
   */
  public RegexProgram(String pattern, EnumSet<RegexFlag> flags, CaptureGroups capture) {
    assert pattern != null : "pattern may not be null";
    // Without these checks a null only surfaces later, as an NPE in combinedFlags()
    // or when a caller dereferences capture().
    assert flags != null : "flags may not be null";
    assert capture != null : "capture may not be null";
    this.pattern = pattern;
    this.flags = flags;
    this.capture = capture;
  }

  /**
   * Get the pattern used to create this instance
   *
   * @return A regex pattern as a string
   */
  public String pattern() {
    return pattern;
  }

  /**
   * Get the regex flags setting currently held by this instance
   *
   * @return Regex flags setting
   */
  public EnumSet<RegexFlag> flags() {
    return flags;
  }

  /**
   * Reset the regex flags setting for this instance
   *
   * @param flags Regex flags setting
   */
  public void setFlags(EnumSet<RegexFlag> flags) {
    assert flags != null : "flags may not be null";
    this.flags = flags;
  }

  /**
   * Get the capture groups setting currently held by this instance
   *
   * @return Capture groups setting
   */
  public CaptureGroups capture() {
    return capture;
  }

  /**
   * Reset the capture groups setting for this instance
   *
   * @param capture Capture groups setting
   */
  public void setCapture(CaptureGroups capture) {
    assert capture != null : "capture may not be null";
    this.capture = capture;
  }

  /**
   * Combine the regex flags using 'or'
   *
   * @return An integer representing the value of combined (or'ed) flags
   */
  public int combinedFlags() {
    int allFlags = 0;
    for (RegexFlag flag : flags) {
      allFlags |= flag.nativeId;
    }
    return allFlags;
  }
}
Loading

0 comments on commit 17554ad

Please sign in to comment.