From 476d5bbf9cfdbcef024bdccc29f30cd1c6fdbc94 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:02:08 -0600 Subject: [PATCH 1/4] Handle parquet list data corner case (#12698) Fixes an issue with a particular arrangement of page data related to lists. Specifically, it is possible for page `N` to contain "0" rows because the values for the row it is a part of start on page `N-1` and end on page `N+1`. This was defeating logic in the decode kernel that would erroneously cause these values to be skipped. Similar to https://github.com/rapidsai/cudf/pull/12488 this is only reproducible with data out in the wild. In this case, we have a file that we could in theory check in to create a test with, but it is 16 MB so it's fairly large. Looking for feedback on whether this is too big. Authors: - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12698 --- cpp/src/io/parquet/page_data.cu | 50 +++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 23d130e1585..ee115e7432a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -104,20 +104,41 @@ struct page_state_s { * specified row bounds * * @param s The page to be checked - * @param min_row The starting row index + * @param start_row The starting row index * @param num_rows The number of rows * * @return True if the page spans the beginning or the end of the row bounds */ -inline __device__ bool is_bounds_page(page_state_s* const s, size_t min_row, size_t num_rows) +inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) { size_t const page_begin = s->col.start_row + s->page.chunk_row; size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = min_row; - size_t const end = min_row + num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)); } +/** + * @brief Returns whether or not a page is completely contained within the specified + * row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page is completely contained within the row bounds + */ +inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return page_begin >= begin && page_end <= end; +} + /** * @brief Read a 32-bit varint integer * @@ -1728,10 +1749,11 @@ __global__ void __launch_bounds__(block_size) auto const thread_depth = depth + t; if (thread_depth < s->page.num_output_nesting_levels) { // if we are not a bounding page (as checked above) then we are either - // returning 0 rows from the page (completely outside the bounds) or all - // rows in the page (completely within the bounds) + // returning all rows/values from this page, or 0 of them pp->nesting[thread_depth].batch_size = - s->num_rows == 0 ? 0 : pp->nesting[thread_depth].size; + (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) + ? 0 + : pp->nesting[thread_depth].size; } depth += blockDim.x; } @@ -1838,7 +1860,19 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0 && !(has_repetition && is_bounds_page(s, min_row, num_rows))) { return; } + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || + is_page_contained(s, min_row, num_rows)))) { + return; + } if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; From 89ec635dceacde2b6715af253029ef317905df4e Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 8 Feb 2023 12:17:50 -0500 Subject: [PATCH 2/4] Update shared workflow branches (#12733) This PR updates the branch reference used for our shared workflows. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/12733 --- .github/workflows/build.yaml | 14 +++++++------- .github/workflows/pr.yaml | 26 +++++++++++++------------- .github/workflows/test.yaml | 14 +++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3366554db30..26d07515f70 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: skip_upload_pkgs: libcudf-example wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cf20b0006a2..f33fc15c52f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,32 +25,32 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -58,14 +58,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: pull-request node_type: "gpu-latest-1" @@ -75,7 +75,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: pull-request node_type: "gpu-latest-1" @@ -85,7 +85,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04 with: build_type: pull-request package-name: cudf @@ -94,7 +94,7 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04 with: build_type: pull-request package-name: cudf @@ -106,7 +106,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.04 with: build_type: pull-request package-name: dask_cudf @@ -115,7 +115,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04 with: build_type: pull-request package-name: dask_cudf diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1b117bb2f4f..ff19d51f8ef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -78,7 +78,7 @@ jobs: test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests" wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.02 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.04 with: build_type: nightly branch: ${{ inputs.branch }} From d3f9dafa49c973c5e5d8b8a9336bbc92555ea0c3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:41:10 -0600 Subject: [PATCH 3/4] Fix faulty conditional logic in JIT `GroupBy.apply` (#12706) Closes https://github.com/rapidsai/cudf/issues/12686 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12706 --- python/cudf/cudf/tests/test_groupby.py | 17 +++++++++++++++++ python/cudf/udf_cpp/groupby/function.cu | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c5b330fd89c..1fea3c7a37e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -519,6 +519,23 @@ def test_groupby_apply_jit_args(func, args, groupby_jit_data): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def test_groupby_apply_jit_block_divergence(): + # https://github.com/rapidsai/cudf/issues/12686 + df = cudf.DataFrame( + { + "a": [0, 0, 0, 1, 1, 1], + "b": [1, 1, 1, 2, 3, 4], + } + ) + + def diverging_block(grp_df): + if grp_df["a"].mean() > 0: + return grp_df["b"].mean() + return 0 + + run_groupby_apply_jit_test(df, diverging_block, ["a"]) + + @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index f94f99c4b49..782371b8a44 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -284,7 +284,7 @@ extern "C" { __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ { \ return_type const res = name(data, size); \ - if (threadIdx.x == 0) { *numba_return_value = res; } \ + *numba_return_value = res; \ __syncthreads(); \ return 0; \ } @@ -309,8 +309,8 @@ extern "C" { __device__ int name##_##cname( \ int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ { \ - auto const res = name(data, index, size); \ - if (threadIdx.x == 0) { *numba_return_value = res; } \ + auto const res = name(data, index, size); \ + *numba_return_value = res; \ __syncthreads(); \ return 0; \ } From 0161ba896a1d70ba3e049bdbb3d649cedba2aeb0 Mon Sep 17 00:00:00 2001 From: Cindy Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:48:38 -0800 Subject: [PATCH 4/4] Add `regex_program` strings replacing java APIs and tests (#12701) This PR adds [replace_re, replace_with_backrefs](https://docs.rapids.ai/api/libcudf/nightly/replace__re_8hpp.html) related `regex_program` java APIs and unit tests. Part of work for https://github.com/NVIDIA/spark-rapids/issues/7295. Authors: - Cindy Jiang (https://github.com/cindyyuanjiang) Approvers: - Jason Lowe (https://github.com/jlowe) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12701 --- .../main/java/ai/rapids/cudf/ColumnView.java | 71 +++++++++++--- java/src/main/native/src/ColumnViewJni.cpp | 43 ++++---- .../java/ai/rapids/cudf/ColumnVectorTest.java | 98 ++++++++++++------- 3 files changed, 149 insertions(+), 63 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 2d0bf28225f..0cb9ed37d9f 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2922,8 +2922,21 @@ public final ColumnVector stringReplace(Scalar target, Scalar replace) { * @param repl The string scalar to replace for each pattern match. * @return A new column vector containing the string results. */ + @Deprecated public final ColumnVector replaceRegex(String pattern, Scalar repl) { - return replaceRegex(pattern, repl, -1); + return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl); + } + + /** + * For each string, replaces any character sequence matching the given regex program pattern + * using the replacement string scalar. + * + * @param regexProg The regex program with pattern to search within each string. + * @param repl The string scalar to replace for each pattern match. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl) { + return replaceRegex(regexProg, repl, -1); } /** @@ -2935,12 +2948,27 @@ public final ColumnVector replaceRegex(String pattern, Scalar repl) { * @param maxRepl The maximum number of times a replacement should occur within each string. * @return A new column vector containing the string results. */ + @Deprecated public final ColumnVector replaceRegex(String pattern, Scalar repl, int maxRepl) { + return replaceRegex(new RegexProgram(pattern, CaptureGroups.NON_CAPTURE), repl, maxRepl); + } + + /** + * For each string, replaces any character sequence matching the given regex program pattern + * using the replacement string scalar. + * + * @param regexProg The regex program with pattern to search within each string. + * @param repl The string scalar to replace for each pattern match. + * @param maxRepl The maximum number of times a replacement should occur within each string. + * @return A new column vector containing the string results. + */ + public final ColumnVector replaceRegex(RegexProgram regexProg, Scalar repl, int maxRepl) { if (!repl.getType().equals(DType.STRING)) { throw new IllegalArgumentException("Replacement must be a string scalar"); } - return new ColumnVector(replaceRegex(getNativeView(), pattern, repl.getScalarHandle(), - maxRepl)); + assert regexProg != null : "regex program may not be null"; + return new ColumnVector(replaceRegex(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, repl.getScalarHandle(), maxRepl)); } /** @@ -2966,9 +2994,26 @@ public final ColumnVector replaceMultiRegex(String[] patterns, ColumnView repls) * @param replace The replacement template for creating the output string. * @return A new java column vector containing the string results. */ + @Deprecated public final ColumnVector stringReplaceWithBackrefs(String pattern, String replace) { - return new ColumnVector(stringReplaceWithBackrefs(getNativeView(), pattern, - replace)); + return stringReplaceWithBackrefs(new RegexProgram(pattern), replace); + } + + /** + * For each string, replaces any character sequence matching the given regex program + * pattern using the replace template for back-references. + * + * Any null string entries return corresponding null output column entries. + * + * @param regexProg The regex program with pattern to search within each string. + * @param replace The replacement template for creating the output string. + * @return A new java column vector containing the string results. + */ + public final ColumnVector stringReplaceWithBackrefs(RegexProgram regexProg, String replace) { + assert regexProg != null : "regex program may not be null"; + return new ColumnVector( + stringReplaceWithBackrefs(getNativeView(), regexProg.pattern(), regexProg.combinedFlags(), + regexProg.capture().nativeId, replace)); } /** @@ -4129,12 +4174,14 @@ private static native long substringColumn(long columnView, long startColumn, lo * Native method for replacing each regular expression pattern match with the specified * replacement string. * @param columnView native handle of the cudf::column_view being operated on. - * @param pattern The regular expression pattern to search within each string. + * @param pattern regular expression pattern to search within each string. + * @param flags regex flags setting. + * @param capture capture groups setting. * @param repl native handle of the cudf::scalar containing the replacement string. * @param maxRepl maximum number of times to replace the pattern within a string * @return native handle of the resulting cudf column containing the string results. */ - private static native long replaceRegex(long columnView, String pattern, + private static native long replaceRegex(long columnView, String pattern, int flags, int capture, long repl, long maxRepl) throws CudfException; /** @@ -4148,15 +4195,17 @@ private static native long replaceMultiRegex(long columnView, String[] patterns, long repls) throws CudfException; /** - * Native method for replacing any character sequence matching the given pattern - * using the replace template for back-references. + * Native method for replacing any character sequence matching the given regex program + * pattern using the replace template for back-references. * @param columnView native handle of the cudf::column_view being operated on. * @param pattern The regular expression patterns to search within each string. + * @param flags Regex flags setting. + * @param capture Capture groups setting. * @param replace The replacement template for creating the output string. * @return native handle of the resulting cudf column containing the string results. */ - private static native long stringReplaceWithBackrefs(long columnView, String pattern, - String replace) throws CudfException; + private static native long stringReplaceWithBackrefs(long columnView, String pattern, int flags, + int capture, String replace) throws CudfException; /** * Native method for checking if strings in a column starts with a specified comparison string. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 958efd364ed..c42cc430560 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1606,21 +1606,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv *env, jclass, - jlong j_column_view, - jstring j_pattern, jlong j_repl, - jlong j_maxrepl) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex( + JNIEnv *env, jclass, jlong j_column_view, jstring j_pattern, jint regex_flags, + jint capture_groups, jlong j_repl, jlong j_maxrepl) { JNI_NULL_CHECK(env, j_column_view, "column is null", 0); JNI_NULL_CHECK(env, j_pattern, "pattern string is null", 0); JNI_NULL_CHECK(env, j_repl, "replace scalar is null", 0); try { cudf::jni::auto_set_device(env); - auto cv = reinterpret_cast(j_column_view); - cudf::strings_column_view scv(*cv); - cudf::jni::native_jstring pattern(env, j_pattern); - auto repl = reinterpret_cast(j_repl); - return release_as_jlong(cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl)); + auto const cv = reinterpret_cast(j_column_view); + auto const strings_column = cudf::strings_column_view{*cv}; + auto const pattern = cudf::jni::native_jstring(env, j_pattern); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + auto const repl = reinterpret_cast(j_repl); + return release_as_jlong( + cudf::strings::replace_re(strings_column, *regex_prog, *repl, j_maxrepl)); } CATCH_STD(env, 0); } @@ -1646,19 +1649,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs( - JNIEnv *env, jclass, jlong column_view, jstring patternObj, jstring replaceObj) { + JNIEnv *env, jclass, jlong j_column_view, jstring pattern_obj, jint regex_flags, + jint capture_groups, jstring replace_obj) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, patternObj, "pattern string is null", 0); - JNI_NULL_CHECK(env, replaceObj, "replace string is null", 0); + JNI_NULL_CHECK(env, j_column_view, "column is null", 0); + JNI_NULL_CHECK(env, pattern_obj, "pattern string is null", 0); + JNI_NULL_CHECK(env, replace_obj, "replace string is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::jni::native_jstring ss_pattern(env, patternObj); - cudf::jni::native_jstring ss_replace(env, replaceObj); + auto const cv = reinterpret_cast(j_column_view); + auto const strings_column = cudf::strings_column_view{*cv}; + auto const pattern = cudf::jni::native_jstring(env, pattern_obj); + auto const flags = static_cast(regex_flags); + auto const groups = static_cast(capture_groups); + auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups); + cudf::jni::native_jstring ss_replace(env, replace_obj); return release_as_jlong( - cudf::strings::replace_with_backrefs(scv, ss_pattern.get(), ss_replace.get())); + cudf::strings::replace_with_backrefs(strings_column, *regex_prog, ss_replace.get())); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index ab4baf74277..db64dcb08c7 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5147,29 +5147,42 @@ void teststringReplaceThrowsException() { @Test void testReplaceRegex() { - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl); - ColumnVector expected = - ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { - assertColumnsAreEqual(expected, actual); - } + try (ColumnVector v = ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); + Scalar repl = Scalar.fromString("Repl")) { + String pattern = "[tT]itle"; + RegexProgram regexProg = new RegexProgram(pattern, CaptureGroups.NON_CAPTURE); - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl, 0)) { - assertColumnsAreEqual(v, actual); - } + try (ColumnVector actual = v.replaceRegex(pattern, repl); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } - try (ColumnVector v = - ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); - Scalar repl = Scalar.fromString("Repl"); - ColumnVector actual = v.replaceRegex("[tT]itle", repl, 1); - ColumnVector expected = - ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { - assertColumnsAreEqual(expected, actual); + try (ColumnVector actual = v.replaceRegex(pattern, repl, 0)) { + assertColumnsAreEqual(v, actual); + } + + try (ColumnVector actual = v.replaceRegex(pattern, repl, 1); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Repl with Repl", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl, 0)) { + assertColumnsAreEqual(v, actual); + } + + try (ColumnVector actual = v.replaceRegex(regexProg, repl, 1); + ColumnVector expected = + ColumnVector.fromStrings("Repl and Title with title", "nothing", null, "Repl")) { + assertColumnsAreEqual(expected, actual); + } } } @@ -5188,45 +5201,55 @@ void testReplaceMultiRegex() { @Test void testStringReplaceWithBackrefs() { - try (ColumnVector v = ColumnVector.fromStrings("

title

", "

another title

", - null); + try (ColumnVector v = ColumnVector.fromStrings("

title

", "

another title

", null); ColumnVector expected = ColumnVector.fromStrings("

title

", "

another title

", null); - ColumnVector actual = v.stringReplaceWithBackrefs("

(.*)

", "

\\1

")) { + ColumnVector actual = v.stringReplaceWithBackrefs("

(.*)

", "

\\1

"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("

(.*)

"), "

\\1

")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } try (ColumnVector v = ColumnVector.fromStrings("2020-1-01", "2020-2-02", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02", null); - ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])-"), "-0\\1-")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } - try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2", - "2020-03-3invalid", null); + try (ColumnVector v = ColumnVector.fromStrings("2020-01-1", "2020-02-2", "2020-03-3invalid", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01", "2020-02-02", "2020-03-3invalid", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "-([0-9])$", "-0\\1")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])$", "-0\\1"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])$"), "-0\\1")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } try (ColumnVector v = ColumnVector.fromStrings("2020-01-1 random_text", "2020-02-2T12:34:56", - "2020-03-3invalid", null); + "2020-03-3invalid", null); ColumnVector expected = ColumnVector.fromStrings("2020-01-01 random_text", "2020-02-02T12:34:56", "2020-03-3invalid", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "-([0-9])([ T])", "-0\\1\\2")) { + ColumnVector actual = v.stringReplaceWithBackrefs("-([0-9])([ T])", "-0\\1\\2"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("-([0-9])([ T])"), "-0\\1\\2")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } // test zero as group index try (ColumnVector v = ColumnVector.fromStrings("aa-11 b2b-345", "aa-11a 1c-2b2 b2-c3", "11-aa", null); ColumnVector expected = ColumnVector.fromStrings("aa-11:aa:11; b2b-345:b:345;", "aa-11:aa:11;a 1c-2:c:2;b2 b2-c3", "11-aa", null); - ColumnVector actual = v.stringReplaceWithBackrefs( - "([a-z]+)-([0-9]+)", "${0}:${1}:${2};")) { + ColumnVector actual = v.stringReplaceWithBackrefs("([a-z]+)-([0-9]+)", "${0}:${1}:${2};"); + ColumnVector actualRe = + v.stringReplaceWithBackrefs(new RegexProgram("([a-z]+)-([0-9]+)"), "${0}:${1}:${2};")) { assertColumnsAreEqual(expected, actual); + assertColumnsAreEqual(expected, actualRe); } // group index exceeds group count @@ -5236,6 +5259,13 @@ void testStringReplaceWithBackrefs() { } }); + // group index exceeds group count + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("ABC123defgh"); + ColumnVector r = + v.stringReplaceWithBackrefs(new RegexProgram("([A-Z]+)([0-9]+)([a-z]+)"), "\\4")) { + } + }); } @Test