From e3803319d9578e976e90b5cf05c99dc3d7063551 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 3 Feb 2023 12:21:31 -0600 Subject: [PATCH 1/6] Remove child fom newCudaAsyncMemoryResource (#12681) Fixes a bug introduced here https://github.com/rapidsai/cudf/pull/12632 where the C++ version of the jni call `Rmm.newCudaAsyncMemoryResource` was taking an extra argument (`child`), causing the `threshold` to be set to a random value since the calling code defined the native method as taking 2 longs, not 3. The above can affect performance if the threshold is set to 0, for example. The async pool will reduce its footprint by releasing memory, which means we need to go back and reallocate. This is how I found it. I am starting the build process for this, so it's going to take me a bit to test. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12681 --- java/src/main/native/src/RmmJni.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1ce69414c98..5bbb5383d93 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -549,10 +549,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv *env, - jclass clazz, - jlong child, jlong init, + jclass clazz, jlong init, jlong release) { - JNI_NULL_CHECK(env, child, "child is null", 0); try { cudf::jni::auto_set_device(env); auto ret = new rmm::mr::cuda_async_memory_resource(init, release); From 17554adf507cd43f9fbfe51820163751d220188c Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Fri, 3 Feb 2023 13:19:30 -0800 Subject: [PATCH 2/6] Add `regex_program` searching APIs and related java classes (#12666) --- 
.../java/ai/rapids/cudf/CaptureGroups.java | 36 +++++ .../main/java/ai/rapids/cudf/ColumnView.java | 108 ++++++++++--- .../main/java/ai/rapids/cudf/RegexFlag.java | 37 +++++ .../java/ai/rapids/cudf/RegexProgram.java | 134 +++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 58 ++++--- .../java/ai/rapids/cudf/ColumnVectorTest.java | 142 +++++++++++------- 6 files changed, 422 insertions(+), 93 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/CaptureGroups.java create mode 100644 java/src/main/java/ai/rapids/cudf/RegexFlag.java create mode 100644 java/src/main/java/ai/rapids/cudf/RegexProgram.java diff --git a/java/src/main/java/ai/rapids/cudf/CaptureGroups.java b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java new file mode 100644 index 00000000000..2ab778dbc35 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CaptureGroups.java @@ -0,0 +1,36 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Capture groups setting, closely following cudf::strings::capture_groups. + * + * For processing a regex pattern containing capture groups. These can be used + * to optimize the generated regex instructions where the capture groups do not + * require extracting the groups. 
+ */ +public enum CaptureGroups { + EXTRACT(0), // capture groups processed normally for extract + NON_CAPTURE(1); // convert all capture groups to non-capture groups + + final int nativeId; // Native id, for use with libcudf. + private CaptureGroups(int nativeId) { // Only constant values should be used + this.nativeId = nativeId; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 47d6b7573cd..b3111cec77b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3153,8 +3153,8 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h * match the given regex pattern but only at the beginning of the string. * * ``` - * cv = ["abc","123","def456"] - * result = cv.matches_re("\\d+") + * cv = ["abc", "123", "def456"] + * result = cv.matchesRe("\\d+") * r is now [false, true, false] * ``` * Any null string entries return corresponding null output column entries. @@ -3164,11 +3164,34 @@ public final ColumnVector clamp(Scalar lo, Scalar loReplace, Scalar hi, Scalar h * @param pattern Regex pattern to match to each string. * @return New ColumnVector of boolean results for each string. */ + @Deprecated public final ColumnVector matchesRe(String pattern) { + return matchesRe(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE)); + } + + /** + * Returns a boolean ColumnVector identifying rows which + * match the given regex program pattern but only at the beginning of the string. + * + * ``` + * cv = ["abc", "123", "def456"] + * p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * r = cv.matchesRe(p) + * r is now [false, true, false] + * ``` + * Any null string entries return corresponding null output column entries. + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * + * @param regexProg Regex program to match to each string. 
+ * @return New ColumnVector of boolean results for each string. + */ + public final ColumnVector matchesRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - assert !pattern.isEmpty() : "pattern string may not be empty"; - return new ColumnVector(matchesRe(getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; + return new ColumnVector(matchesRe(getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -3176,8 +3199,8 @@ public final ColumnVector matchesRe(String pattern) { * match the given regex pattern starting at any location. * * ``` - * cv = ["abc","123","def456"] - * result = cv.matches_re("\\d+") + * cv = ["abc", "123", "def456"] + * r = cv.containsRe("\\d+") * r is now [false, true, true] * ``` * Any null string entries return corresponding null output column entries. @@ -3187,11 +3210,34 @@ public final ColumnVector matchesRe(String pattern) { * @param pattern Regex pattern to match to each string. * @return New ColumnVector of boolean results for each string. */ + @Deprecated public final ColumnVector containsRe(String pattern) { + return containsRe(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE)); + } + + /** + * Returns a boolean ColumnVector identifying rows which + * match the given RegexProgram pattern starting at any location. + * + * ``` + * cv = ["abc", "123", "def456"] + * p = new RegexProgram("\\d+", CaptureGroups.NON_CAPTURE) + * r = cv.containsRe(p) + * r is now [false, true, true] + * ``` + * Any null string entries return corresponding null output column entries. + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * + * @param regexProg Regex program to match to each string. 
+ * @return New ColumnVector of boolean results for each string. + */ + public final ColumnVector containsRe(RegexProgram regexProg) { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - assert !pattern.isEmpty() : "pattern string may not be empty"; - return new ColumnVector(containsRe(getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + assert !regexProg.pattern().isEmpty() : "pattern string may not be empty"; + return new ColumnVector(containsRe(getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -3222,11 +3268,31 @@ public final Table extractRe(String pattern) throws CudfException { * @param idx The regex group index * @return A new column vector of extracted matches */ + @Deprecated public final ColumnVector extractAllRecord(String pattern, int idx) { + if (idx == 0) { + return extractAllRecord(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), idx); + } + return extractAllRecord(new RegexProgram(pattern), idx); + } + + /** + * Extracts all strings that match the given regex program pattern and corresponds to the + * regular expression group index. Any null inputs also result in null output entries. 
+ * + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * @param regexProg The regex program + * @param idx The regex group index + * @return A new column vector of extracted matches + */ + public final ColumnVector extractAllRecord(RegexProgram regexProg, int idx) { assert type.equals(DType.STRING) : "column type must be a String"; assert idx >= 0 : "group index must be at least 0"; - - return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx)); + assert regexProg != null : "regex program may not be null"; + return new ColumnVector( + extractAllRecord(this.getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, idx)); } /** @@ -3995,21 +4061,25 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long stringStrip(long columnView, int type, long toStrip) throws CudfException; /** - * Native method for checking if strings match the passed in regex pattern from the + * Native method for checking if strings match the passed in regex program pattern from the * beginning of the string. * @param cudfViewHandle native handle of the cudf::column_view being operated on. * @param pattern string regex pattern. + * @param flags regex flags setting. + * @param capture capture groups setting. * @return native handle of the resulting cudf column containing the boolean results. */ - private static native long matchesRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long matchesRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** - * Native method for checking if strings match the passed in regex pattern starting at any location. + * Native method for checking if strings match the passed in regex program pattern starting at any location. * @param cudfViewHandle native handle of the cudf::column_view being operated on. 
* @param pattern string regex pattern. + * @param flags regex flags setting. + * @param capture capture groups setting. * @return native handle of the resulting cudf column containing the boolean results. */ - private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long containsRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** * Native method for checking if strings match the passed in like pattern @@ -4035,14 +4105,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException; /** - * Native method for extracting all results corresponding to group idx from a regular expression. + * Native method for extracting all results corresponding to group idx from a regex program pattern. * * @param nativeHandle Native handle of the cudf::column_view being operated on. * @param pattern String regex pattern. + * @param flags Regex flags setting. + * @param capture Capture groups setting. * @param idx Regex group index. A 0 value means matching the entire regex. * @return Native handle of a string column of the result. */ - private static native long extractAllRecord(long nativeHandle, String pattern, int idx); + private static native long extractAllRecord(long nativeHandle, String pattern, int flags, int capture, int idx); private static native long urlDecode(long cudfViewHandle); diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java new file mode 100644 index 00000000000..7ed8e0354c9 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -0,0 +1,37 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Regex flags setting, closely following cudf::strings::regex_flags. + * + * These types can be or'd to combine them. The values are chosen to + * leave room for future flags and to match the Python flag values. + */ +public enum RegexFlag { + DEFAULT(0), // default + MULTILINE(8), // the '^' and '$' honor new-line characters + DOTALL(16), // the '.' matching includes new-line characters + ASCII(256); // use only ASCII when matching built-in character classes + + final int nativeId; // Native id, for use with libcudf. + private RegexFlag(int nativeId) { // Only constant values should be used + this.nativeId = nativeId; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/RegexProgram.java b/java/src/main/java/ai/rapids/cudf/RegexProgram.java new file mode 100644 index 00000000000..191a0b95ff3 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/RegexProgram.java @@ -0,0 +1,134 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package ai.rapids.cudf; + +import java.util.EnumSet; + +/** + * Regex program class, closely following cudf::strings::regex_program. + */ +public class RegexProgram { + private String pattern; // regex pattern + // regex flags for interpreting special characters in the pattern + private EnumSet flags; + // controls how capture groups in the pattern are used + // default is to extract a capture group + private CaptureGroups capture; + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + */ + public RegexProgram(String pattern) { + this(pattern, EnumSet.of(RegexFlag.DEFAULT), CaptureGroups.EXTRACT); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + * @param flags Regex flags setting + */ + public RegexProgram(String pattern, EnumSet flags) { + this(pattern, flags, CaptureGroups.EXTRACT); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern setting + * @param capture Capture groups setting + */ + public RegexProgram(String pattern, CaptureGroups capture) { + this(pattern, EnumSet.of(RegexFlag.DEFAULT), capture); + } + + /** + * Constructor for RegexProgram + * + * @param pattern Regex pattern + * @param flags Regex flags setting + * @param capture Capture groups setting + */ + public RegexProgram(String pattern, EnumSet flags, CaptureGroups capture) { + assert pattern != null : "pattern may not be null"; + this.pattern = pattern; + this.flags = flags; + this.capture = capture; + } + + /** + * Get the pattern used to create this instance + * + * @return A regex pattern as a string + */ + public String pattern() { + return pattern; + } + + /** + * Get the regex flags setting used to create this instance + * + * @return Regex flags setting + */ + public EnumSet flags() { + return flags; + } + + /** + * Reset the regex flags setting for this instance + * + * @param flags Regex flags setting + */ + public void setFlags(EnumSet flags) { + this.flags = flags; + } + 
+ /** + * Get the capture groups setting used to create this instance + * + * @return Capture groups setting + */ + public CaptureGroups capture() { + return capture; + } + + /** + * Reset the capture groups setting for this instance + * + * @param capture Capture groups setting + */ + public void setCapture(CaptureGroups capture) { + this.capture = capture; + } + + /** + * Combine the regex flags using 'or' + * + * @return An integer representing the value of combined (or'ed) flags + */ + public int combinedFlags() { + int allFlags = 0; + for (RegexFlag flag : flags) { + allFlags |= flag.nativeId; + } + return allFlags; + } +} diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index b48ddae196b..ff07a6786c1 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -1290,32 +1291,42 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringContains(JNIEnv *en JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv *env, jobject j_object, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", false); - JNI_NULL_CHECK(env, patternObj, "pattern is null", false); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); - cudf::jni::native_jstring pattern(env, patternObj); - return release_as_jlong(cudf::strings::matches_re(strings_column, pattern.get())); + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = 
static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + return release_as_jlong(cudf::strings::matches_re(strings_column, *regex_prog)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, jobject j_object, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", false); - JNI_NULL_CHECK(env, patternObj, "pattern is null", false); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); - cudf::jni::native_jstring pattern(env, patternObj); - return release_as_jlong(cudf::strings::contains_re(strings_column, pattern.get())); + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const capture = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, capture); + return release_as_jlong(cudf::strings::contains_re(strings_column, *regex_prog)); } CATCH_STD(env, 0); } @@ -1679,21 +1690,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *en CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord(JNIEnv *env, jclass, - jlong j_view_handle, - jstring pattern_obj, - jint idx) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord( + JNIEnv *env, jclass, jlong j_view_handle, jstring pattern_obj, jint regex_flags, + jint capture_groups, jint idx) { JNI_NULL_CHECK(env, j_view_handle, "column is 
null", 0); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", 0); try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const strings_column{ - *reinterpret_cast(j_view_handle)}; - cudf::jni::native_jstring pattern(env, pattern_obj); - - auto result = (idx == 0) ? cudf::strings::findall(strings_column, pattern.get()) : - cudf::strings::extract_all_record(strings_column, pattern.get()); - + auto const column_view = reinterpret_cast(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + auto result = (idx == 0) ? cudf::strings::findall(strings_column, *regex_prog) : + cudf::strings::extract_all_record(strings_column, *regex_prog); return release_as_jlong(result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fc0a542e0a7..26817281c2e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4053,31 +4053,38 @@ void testExtractRe() { @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; + RegexProgram regexProg = new RegexProgram(pattern); try (ColumnVector v = ColumnVector.fromStrings("a1", "b2", "c3", null, "a1b1c3a2"); - ColumnVector expectedIdx0 = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("a1"), - Arrays.asList("b2"), - Arrays.asList(), - null, - Arrays.asList("a1", "b1", "a2")); - ColumnVector expectedIdx12 = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("a", "1"), - Arrays.asList("b", 
"2"), - null, - null, - Arrays.asList("a", "1", "b", "1", "a", "2")); - - ColumnVector resultIdx0 = v.extractAllRecord(pattern, 0); - ColumnVector resultIdx1 = v.extractAllRecord(pattern, 1); - ColumnVector resultIdx2 = v.extractAllRecord(pattern, 2); - ) { - assertColumnsAreEqual(expectedIdx0, resultIdx0); - assertColumnsAreEqual(expectedIdx12, resultIdx1); - assertColumnsAreEqual(expectedIdx12, resultIdx2); + ColumnVector expectedIdx0 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("a1"), + Arrays.asList("b2"), + Arrays.asList(), + null, + Arrays.asList("a1", "b1", "a2")); + ColumnVector expectedIdx12 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("a", "1"), + Arrays.asList("b", "2"), + null, + null, + Arrays.asList("a", "1", "b", "1", "a", "2"))) { + try (ColumnVector resultIdx0 = v.extractAllRecord(pattern, 0); + ColumnVector resultIdx1 = v.extractAllRecord(pattern, 1); + ColumnVector resultIdx2 = v.extractAllRecord(pattern, 2)) { + assertColumnsAreEqual(expectedIdx0, resultIdx0); + assertColumnsAreEqual(expectedIdx12, resultIdx1); + assertColumnsAreEqual(expectedIdx12, resultIdx2); + } + try (ColumnVector resultIdx0 = v.extractAllRecord(regexProg, 0); + ColumnVector resultIdx1 = v.extractAllRecord(regexProg, 1); + ColumnVector resultIdx2 = v.extractAllRecord(regexProg, 2)) { + assertColumnsAreEqual(expectedIdx0, resultIdx0); + assertColumnsAreEqual(expectedIdx12, resultIdx1); + assertColumnsAreEqual(expectedIdx12, resultIdx2); + } } } @@ -4087,26 +4094,39 @@ void testMatchesRe() { String patternString2 = "[A-Za-z]+\\s@[A-Za-z]+"; String patternString3 = ".*"; String patternString4 = ""; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg2 = new RegexProgram(patternString2, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg3 = 
new RegexProgram(patternString3, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg4 = new RegexProgram(patternString4, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00"); - ColumnVector res1 = testStrings.matchesRe(patternString1); - ColumnVector res2 = testStrings.matchesRe(patternString2); - ColumnVector res3 = testStrings.matchesRe(patternString3); + "lazy @dog", "1234", "00:0:00"); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(false, null, false, false, false, - true, true); + true, true); ColumnVector expected2 = ColumnVector.fromBoxedBooleans(false, null, false, false, true, - false, false); + false, false); ColumnVector expected3 = ColumnVector.fromBoxedBooleans(true, null, true, true, true, - true, true)) { - assertColumnsAreEqual(expected1, res1); - assertColumnsAreEqual(expected2, res2); - assertColumnsAreEqual(expected3, res3); + true, true)) { + try (ColumnVector res1 = testStrings.matchesRe(patternString1); + ColumnVector res2 = testStrings.matchesRe(patternString2); + ColumnVector res3 = testStrings.matchesRe(patternString3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + try (ColumnVector res1 = testStrings.matchesRe(regexProg1); + ColumnVector res2 = testStrings.matchesRe(regexProg2); + ColumnVector res3 = testStrings.matchesRe(regexProg3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.matchesRe(patternString4)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.matchesRe(regexProg4)) {} + }); } - assertThrows(AssertionError.class, () -> { - try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", 
"00:0:00"); - ColumnVector res = testStrings.matchesRe(patternString4)) {} - }); } @Test @@ -4115,36 +4135,54 @@ void testContainsRe() { String patternString2 = "[A-Za-z]+\\s@[A-Za-z]+"; String patternString3 = ".*"; String patternString4 = ""; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg2 = new RegexProgram(patternString2, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg3 = new RegexProgram(patternString3, CaptureGroups.NON_CAPTURE); + RegexProgram regexProg4 = new RegexProgram(patternString4, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings(null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); - ColumnVector res1 = testStrings.containsRe(patternString1); - ColumnVector res2 = testStrings.containsRe(patternString2); - ColumnVector res3 = testStrings.containsRe(patternString3); + "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(null, false, false, false, true, true, true, true); ColumnVector expected2 = ColumnVector.fromBoxedBooleans(null, false, false, true, false, false, false, true); ColumnVector expected3 = ColumnVector.fromBoxedBooleans(null, true, true, true, true, true, true, true)) { - assertColumnsAreEqual(expected1, res1); - assertColumnsAreEqual(expected2, res2); - assertColumnsAreEqual(expected3, res3); + try (ColumnVector res1 = testStrings.containsRe(patternString1); + ColumnVector res2 = testStrings.containsRe(patternString2); + ColumnVector res3 = testStrings.containsRe(patternString3)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + try (ColumnVector res1 = testStrings.containsRe(regexProg1); + ColumnVector res2 = testStrings.containsRe(regexProg2); + ColumnVector res3 = testStrings.containsRe(regexProg3)) { + 
assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + assertColumnsAreEqual(expected3, res3); + } + } + try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", + "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs")) { + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.containsRe(patternString4)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector res = testStrings.containsRe(regexProg4)) {} + }); } - assertThrows(AssertionError.class, () -> { - try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "ovér the", - "lazy @dog", "1234", "00:0:00", "abc1234abc", "there @are 2 lazy @dogs"); - ColumnVector res = testStrings.containsRe(patternString4)) {} - }); } @Test - @Disabled("Needs fix for https://github.com/rapidsai/cudf/issues/4671") void testContainsReEmptyInput() { String patternString1 = ".*"; + RegexProgram regexProg1 = new RegexProgram(patternString1, CaptureGroups.NON_CAPTURE); try (ColumnVector testStrings = ColumnVector.fromStrings(""); ColumnVector res1 = testStrings.containsRe(patternString1); + ColumnVector resReProg1 = testStrings.containsRe(regexProg1); ColumnVector expected1 = ColumnVector.fromBoxedBooleans(true)) { assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected1, resReProg1); } } From 58e0fde34682c47923b1285e89704e4ac3bd96eb Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 3 Feb 2023 17:16:02 -0500 Subject: [PATCH 3/6] update workflow branches [skip ci] (#12696) This PR updates the branch reference used for our shared workflows. I will open similar PRs for `branch-23.04` next week. 
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) --- .github/workflows/build.yaml | 14 +++++++------- .github/workflows/pr.yaml | 26 +++++++++++++------------- .github/workflows/test.yaml | 14 +++++++------- ci/release/update-version.sh | 4 ++++ 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 74632e1d8d7..3366554db30 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: skip_upload_pkgs: libcudf-example wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: wheel-publish-cudf: 
needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 89e14d3e421..cf20b0006a2 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,32 +25,32 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -58,14 +58,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: pull-request node_type: "gpu-latest-1" @@ -75,7 +75,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: pull-request node_type: "gpu-latest-1" @@ -85,7 +85,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + uses: 
rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 with: build_type: pull-request package-name: cudf @@ -94,7 +94,7 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 with: build_type: pull-request package-name: cudf @@ -106,7 +106,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 with: build_type: pull-request package-name: dask_cudf @@ -115,7 +115,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 with: build_type: pull-request package-name: dask_cudf diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b383d185564..1b117bb2f4f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: conda-python-other-tests: # Tests for 
dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -78,7 +78,7 @@ jobs: test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests" wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index ee5c0a823ee..555a67d9cd6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -94,3 +94,7 @@ NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; prin # Wheel builds install intra-RAPIDS dependencies 
from same release sed_runner "s/rmm{cuda_suffix}.*\",/rmm{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup.py sed_runner "s/cudf{cuda_suffix}==.*\",/cudf{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py + +for FILE in .github/workflows/*.yaml; do + sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" +done From 82860012c6b68c12b5b319c387ccfe6bdff33d44 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 6 Feb 2023 12:57:22 -0500 Subject: [PATCH 4/6] Move date to build string in `conda` recipe (#12661) This PR moves the date string from the version to the build string for conda recipes in this repository. This is necessary to ensure that the conda packages resulting from PR builds can be installed in the same environment as nightly conda packages, which is useful for testing purposes. Additionally, this prevents a bug from occurring where the Python builds fail because the date string it computes is different than the one computed by the C++ build, therefore causing the Python build to search for a C++ build artifact that doesn't exist. xref: https://github.com/rapidsai/rmm/pull/1195 Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12661 --- conda/recipes/cudf/meta.yaml | 7 ++++--- conda/recipes/cudf_kafka/meta.yaml | 7 ++++--- conda/recipes/custreamz/meta.yaml | 7 ++++--- conda/recipes/dask-cudf/meta.yaml | 7 ++++--- conda/recipes/libcudf/meta.yaml | 11 ++++++----- conda/recipes/strings_udf/meta.yaml | 7 ++++--- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 205ca2a995c..0d5b5d16e08 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: cudf @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - PARALLEL_LEVEL - CMAKE_GENERATOR diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 5fa0411803b..5cbea78e82b 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: cudf_kafka @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - PARALLEL_LEVEL - CMAKE_GENERATOR diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 13d54011e02..7f2f7482dfd 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,9 +1,10 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: custreamz @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - PARALLEL_LEVEL - CMAKE_GENERATOR diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index feb9e53e37f..6c7bda6d5bf 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
-{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: dask-cudf @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - PARALLEL_LEVEL - CMAKE_GENERATOR diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 911080ebdb6..b0b86b427b7 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,10 +1,11 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} {% set cuda_spec = ">=" + cuda_major ~ ",<" + (cuda_major | int + 1) ~ ".0a0" %} # i.e. 
>=11,<12.0a0 +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: libcudf-split @@ -52,7 +53,7 @@ outputs: script: install_libcudf.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} run_exports: - {{ pin_subpackage("libcudf", max_pin="x.x") }} ignore_run_exports_from: @@ -308,7 +309,7 @@ outputs: script: install_libcudf_kafka.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - {{ compiler('cuda') }} requirements: @@ -331,7 +332,7 @@ outputs: script: install_libcudf_example.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - {{ compiler('cuda') }} requirements: @@ -358,7 +359,7 @@ outputs: script: install_libcudf_tests.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - {{ compiler('cuda') }} requirements: diff --git a/conda/recipes/strings_udf/meta.yaml b/conda/recipes/strings_udf/meta.yaml index 0928c5d3315..93316a92c22 100644 --- a/conda/recipes/strings_udf/meta.yaml +++ b/conda/recipes/strings_udf/meta.yaml @@ -1,10 +1,11 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} package: name: strings_udf @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - PARALLEL_LEVEL - CMAKE_GENERATOR From c7db81a34ba4a2e8b10f5dc5ccc3652260124b52 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Mon, 6 Feb 2023 10:07:36 -0800 Subject: [PATCH 5/6] Add `regex_program` strings extract java APIs and tests (#12699) This PR adds [extract, extract_all_record](https://docs.rapids.ai/api/libcudf/nightly/strings_2extract_8hpp.html) related `regex_program` java APIs and unit tests. Part of work for https://github.com/NVIDIA/spark-rapids/issues/7295. Authors: - Cindy Jiang (https://github.com/cindyyuanjiang) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12699 --- .../main/java/ai/rapids/cudf/ColumnView.java | 31 ++++++++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 20 +++++++----- .../java/ai/rapids/cudf/ColumnVectorTest.java | 18 ++++++----- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index b3111cec77b..4daa3c17cfc 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3252,10 +3252,28 @@ public final ColumnVector containsRe(RegexProgram regexProg) { * @throws CudfException if any error happens including if the RE does * not contain any capture groups. 
*/ + @Deprecated public final Table extractRe(String pattern) throws CudfException { + return extractRe(new RegexProgram(pattern)); + } + + /** + * For each captured group specified in the given regex program + * return a column in the table. Null entries are added if the string + * does not match. Any null inputs also result in null output entries. + * + * For supported regex patterns refer to: + * @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html + * @param regexProg the regex program to use + * @return the table of extracted matches + * @throws CudfException if any error happens including if the regex + * program does not contain any capture groups. + */ + public final Table extractRe(RegexProgram regexProg) throws CudfException { assert type.equals(DType.STRING) : "column type must be a String"; - assert pattern != null : "pattern may not be null"; - return new Table(extractRe(this.getNativeView(), pattern)); + assert regexProg != null : "regex program may not be null"; + return new Table(extractRe(this.getNativeView(), regexProg.pattern(), + regexProg.combinedFlags(), regexProg.capture().nativeId)); } /** @@ -4100,9 +4118,14 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long stringContains(long cudfViewHandle, long compString) throws CudfException; /** - * Native method for extracting results from an regular expressions. Returns a table handle. + * Native method for extracting results from a regex program pattern. Returns a table handle. + * + * @param cudfViewHandle Native handle of the cudf::column_view being operated on. + * @param pattern String regex pattern. + * @param flags Regex flags setting. + * @param capture Capture groups setting. 
*/ - private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException; + private static native long[] extractRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException; /** * Native method for extracting all results corresponding to group idx from a regex program pattern. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index ff07a6786c1..bfa3fa0a522 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1674,18 +1674,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *env, jclass, jlong j_view_handle, - jstring patternObj) { + jstring pattern_obj, + jint regex_flags, + jint capture_groups) { JNI_NULL_CHECK(env, j_view_handle, "column is null", nullptr); - JNI_NULL_CHECK(env, patternObj, "pattern is null", nullptr); + JNI_NULL_CHECK(env, pattern_obj, "pattern is null", nullptr); try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const strings_column{ - *reinterpret_cast<cudf::column_view *>(j_view_handle)}; - cudf::jni::native_jstring pattern(env, patternObj); - - return cudf::jni::convert_table_for_return( - env, cudf::strings::extract(strings_column, pattern.get())); + auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags); + auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + return cudf::jni::convert_table_for_return(env, + cudf::strings::extract(strings_column, *regex_prog)); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 26817281c2e..46264b7d668 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4040,14 +4040,18 @@ void testStringFindOperations() { @Test void testExtractRe() { - try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null); - Table expected = new Table.TestBuilder() - .column("a", "b", null, null) - .column("1", "2", null, null) - .build(); - Table found = input.extractRe("([ab])(\\d)")) { - assertTablesAreEqual(expected, found); + try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null); + Table expected = new Table.TestBuilder() + .column("a", "b", null, null) + .column("1", "2", null, null) + .build()) { + try (Table found = input.extractRe("([ab])(\\d)")) { + assertTablesAreEqual(expected, found); } + try (Table found = input.extractRe(new RegexProgram("([ab])(\\d)"))) { + assertTablesAreEqual(expected, found); + } + } } @Test From 519e118f12e1bdc89d26eb799687f6ace55490f2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 6 Feb 2023 12:45:23 -0600 Subject: [PATCH 6/6] [REVIEW] Pin `dask` and `distributed` for release (#12695) This PR pins `dask` and `distributed` to `2023.1.1` for `23.02` release. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Mark Sadang (https://github.com/msadang) - Bradley Dice (https://github.com/bdice) --- ci/benchmark/build.sh | 6 +++--- ci/cpu/build.sh | 4 ++-- ci/gpu/build.sh | 4 ++-- .../all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/recipes/custreamz/meta.yaml | 6 +++--- conda/recipes/dask-cudf/meta.yaml | 10 +++++----- conda/recipes/dask-cudf/run_test.sh | 20 +++++++++++++------ dependencies.yaml | 4 ++-- python/dask_cudf/setup.py | 4 ++-- 9 files changed, 35 insertions(+), 27 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index ec4f8d55372..c27fe23d078 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. ######################################### # cuDF GPU build and test script for CI # ######################################### @@ -37,10 +37,10 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.12.0" +export DASK_STABLE_VERSION="2023.1.1" function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 7ffd032bce0..5b4a201e5e9 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. ############################################## # cuDF CPU conda build script for CI # ############################################## @@ -35,7 +35,7 @@ export CONDA_BLD_DIR="$WORKSPACE/.conda-bld" # Whether to keep `dask/label/dev` channel in the env. 
If INSTALL_DASK_MAIN=0, # `dask/label/dev` channel is removed. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Switch to project root; also root of repo checkout cd "$WORKSPACE" diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a33dc842a7f..68375720adf 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -39,10 +39,10 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` unset GIT_DESCRIBE_TAG # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.12.0" +export DASK_STABLE_VERSION="2023.1.1" # ucx-py version export UCX_PY_VERSION='0.30.*' diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 1bcb1978373..cc9cd57edd5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -21,8 +21,8 @@ dependencies: - cxx-compiler - cython>=0.29,<0.30 - dask-cuda=23.02.* -- dask>=2022.12.0 -- distributed>=2022.12.0 +- dask==2023.1.1 +- distributed==2023.1.1 - dlpack>=0.5,<0.6.0a0 - doxygen=1.8.20 - fastavro>=0.22.9 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 13d54011e02..09fa63fc1f7 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -38,8 +38,8 @@ requirements: - python - streamz - cudf ={{ version }} - - dask >=2022.12.0 - - distributed >=2022.12.0 + - dask ==2023.1.1 + - distributed ==2023.1.1 - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka ={{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index feb9e53e37f..991d0a9da34 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -34,14 +34,14 @@ requirements: host: - python - cudf ={{ version }} - - dask >=2022.12.0 - - distributed >=2022.12.0 + - dask ==2023.1.1 + - distributed ==2023.1.1 - cudatoolkit ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask >=2022.12.0 - - distributed >=2022.12.0 + - dask ==2023.1.1 + - distributed ==2023.1.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index f56610bea86..3b1fc46c4f4 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. set -e @@ -17,12 +17,20 @@ if [ "${ARCH}" = "aarch64" ]; then exit 0 fi -# Install the latest version of dask and distributed -logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps +# Dask & Distributed option to install main(nightly) or `conda-forge` packages. 
+export INSTALL_DASK_MAIN=0 -logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2023.1.1" + +# Install the conda-forge or nightly version of dask and distributed +if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" +else + gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall +fi logger "python -c 'import dask_cudf'" python -c "import dask_cudf" diff --git a/dependencies.yaml b/dependencies.yaml index 8790853fbb3..8ee4595f8e7 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -225,8 +225,8 @@ dependencies: - output_types: [conda, requirements] packages: - cachetools - - dask>=2022.12.0 - - distributed>=2022.12.0 + - dask==2023.1.1 + - distributed==2023.1.1 - fsspec>=0.6.0 - numba>=0.56.2 - numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 1108da91d03..4b420b1b97c 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -8,8 +8,8 @@ cuda_suffix = os.getenv("RAPIDS_PY_WHEEL_CUDA_SUFFIX", default="") install_requires = [ - "dask>=2022.12.0", - "distributed>=2022.12.0", + "dask==2023.1.1", + "distributed==2023.1.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.6.0dev0",