diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 8a2c9d9be7c..1699fc16a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,32 +3,11 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e -DEFAULT_CUDA_VER="11.5" -DEFAULT_PYTHON_VER="3.8" - -#Always upload cudf Python package +#Always upload cudf packages export UPLOAD_CUDF=1 - -#Upload libcudf once per CUDA -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]]; then - export UPLOAD_LIBCUDF=1 -else - export UPLOAD_LIBCUDF=0 -fi - -# upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_CUDF_KAFKA=1 -else - export UPLOAD_CUDF_KAFKA=0 -fi - -#We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_LIBCUDF_KAFKA=1 -else - export UPLOAD_LIBCUDF_KAFKA=0 -fi +export UPLOAD_LIBCUDF=1 +export UPLOAD_CUDF_KAFKA=1 +export UPLOAD_LIBCUDF_KAFKA=1 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4637408110..90e94ffcc7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -425,13 +425,11 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu + src/strings/count_matches.cu src/strings/extract/extract.cu src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu - src/strings/findall.cu - src/strings/find.cu - src/strings/find_multiple.cu src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp @@ -441,6 +439,10 @@ add_library( src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/replace/replace_re.cu + src/strings/search/findall.cu + src/strings/search/findall_record.cu + src/strings/search/find.cu + src/strings/search/find_multiple.cu src/strings/split/partition.cu 
src/strings/split/split.cu src/strings/split/split_record.cu diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ca3a748ccad..5f23cbbafdd 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -32,8 +32,9 @@ inline auto make_cuda() { return std::make_shared inline auto make_pool_instance() { static rmm::mr::cuda_memory_resource cuda_mr; - static rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; - return std::shared_ptr(&pool_mr); + static auto pool_mr = + std::make_shared>(&cuda_mr); + return pool_mr; } } // namespace diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 980b353406a..fbcfabb4532 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -46,7 +46,7 @@ static void BM_contains(benchmark::State& state, contains_type ct) cudf::strings::count_re(input, "\\d+"); break; case contains_type::findall: // returns occurrences of matches - cudf::strings::findall_re(input, "\\d+"); + cudf::strings::findall(input, "\\d+"); break; } } diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6c3139747af..4207cddbafb 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,7 +52,37 @@ namespace strings { * @param mr Device memory resource used to allocate the returned table's device memory. * @return New table of strings columns. */ -std::unique_ptr findall_re( +std::unique_ptr
findall( + strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a lists column of strings for each matching occurrence of the + * regex pattern within each string. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * r = findall_record(s, "[ab]") + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * null ] + * @endcode + * + * A null output row results if the pattern is not found in the corresponding row + * input string. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation. + * @param pattern Regex pattern to match within each string. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New lists column of strings. + */ +std::unique_ptr findall_record( strings_column_view const& strings, std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cu similarity index 87% rename from cpp/src/strings/count_matches.cuh rename to cpp/src/strings/count_matches.cu index c14142f4779..d0a6825666b 100644 --- a/cpp/src/strings/count_matches.cuh +++ b/cpp/src/strings/count_matches.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,13 @@ * limitations under the License. 
*/ -#pragma once - +#include #include -#include #include #include #include -#include #include #include @@ -32,6 +29,7 @@ namespace cudf { namespace strings { namespace detail { +namespace { /** * @brief Functor counts the total matches to the given regex in each string. */ @@ -50,12 +48,13 @@ struct count_matches_fn { int32_t end = d_str.length(); while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { ++count; - begin = end; + begin = end + (begin == end); end = d_str.length(); } return count; } }; +} // namespace /** * @brief Returns a column of regex match counts for each string in the given column. @@ -67,11 +66,10 @@ struct count_matches_fn { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device const& d_prog, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Create output column auto counts = make_numeric_column( diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp new file mode 100644 index 00000000000..1339f2b1ebd --- /dev/null +++ b/cpp/src/strings/count_matches.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +class column_device_view; + +namespace strings { +namespace detail { + +class reprog_device; + +/** + * @brief Returns a column of regex match counts for each string in the given column. + * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 584741298c2..c4749eae003 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/search/find.cu similarity index 100% rename from cpp/src/strings/find.cu rename to cpp/src/strings/search/find.cu diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu similarity index 100% rename from cpp/src/strings/find_multiple.cu rename to cpp/src/strings/search/find_multiple.cu diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/search/findall.cu similarity index 95% rename from cpp/src/strings/findall.cu rename to cpp/src/strings/search/findall.cu index c82ab4f81c3..8fb754848d4 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -109,11 +109,11 @@ struct findall_count_fn : public findall_fn { } // namespace // -std::unique_ptr
findall_re( +std::unique_ptr
findall( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = strings.size(); auto const d_strings = column_device_view::create(strings.parent(), stream); @@ -205,12 +205,12 @@ std::unique_ptr
findall_re( // external API -std::unique_ptr
findall_re(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
findall(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall_re(strings, pattern, mr); + return detail::findall(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu new file mode 100644 index 00000000000..9ffdb33f5f2 --- /dev/null +++ b/cpp/src/strings/search/findall_record.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief This functor handles extracting matched strings by applying the compiled regex pattern + * and creating string_index_pairs for all the substrings. 
+ */ +template +struct findall_fn { + column_device_view const d_strings; + reprog_device prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type const idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + auto const spos = d_str.byte_offset(begin); // convert + auto const epos = d_str.byte_offset(end); // to bytes + + d_output[output_idx++] = string_index_pair{d_str.data() + spos, (epos - spos)}; + + begin = end + (begin == end); + end = d_str.length(); + } + } +}; + +} // namespace + +// +std::unique_ptr findall_record( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // compile regex into device object + auto const d_prog = + reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + + // Create lists offsets column + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, + d_offsets + strings_count, + [] __device__(auto const v) { return v > 0; }, + stream, + mr); + + auto const valid_count = strings_count - null_count; + // Return an empty lists column if there are no valid rows + if (valid_count == 0) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets + thrust::exclusive_scan( + 
rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Create indices vector with the total number of matches that will be collected + auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); + + rmm::device_uvector indices(total_matches, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Build the string indices + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the resulting indices + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr findall_record(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall_record(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index d7bf162d36f..4b1305a870a 100644 --- 
a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ TEST_F(StringsFindallTests, FindallTest) nullptr}; std::string pattern = "(\\w+)"; - auto results = cudf::strings::findall_re(strings_view, pattern); + auto results = cudf::strings::findall(strings_view, pattern); EXPECT_TRUE(results->num_columns() == 2); cudf::test::strings_column_wrapper expected1( @@ -75,6 +75,28 @@ TEST_F(StringsFindallTests, FindallTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsFindallTests, FindallRecord) +{ + cudf::test::strings_column_wrapper input( + {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, + {1, 1, 1, 1, 1, 0, 1, 1}); + + auto results = cudf::strings::findall_record(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"3-A"}, + LCW{"4-May", "5-Day", "6-Hay"}, + LCW{"12-Dec", "2021-Jan"}, + LCW{}, + LCW{}, + LCW{}, + LCW{}, + LCW{"25-9000"}}, + valids); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. 
@@ -87,7 +109,7 @@ TEST_F(StringsFindallTests, MediumRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, medium_regex); + auto results = cudf::strings::findall(strings_view, medium_regex); EXPECT_TRUE(results->num_columns() == 2); std::vector h_expected1{"first words 1234", nullptr}; @@ -115,9 +137,11 @@ TEST_F(StringsFindallTests, LargeRegex) std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz", - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" + "12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "34" "5678901234567890", - "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop" + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn" + "op" "qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), @@ -125,7 +149,7 @@ TEST_F(StringsFindallTests, LargeRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, large_regex); + auto results = cudf::strings::findall(strings_view, large_regex); EXPECT_TRUE(results->num_columns() == 1); std::vector h_expected{large_regex.c_str(), nullptr, nullptr}; diff --git a/python/cudf/cudf/_lib/cpp/strings/findall.pxd b/python/cudf/cudf/_lib/cpp/strings/findall.pxd index 189d0770b81..5533467d72a 100644 --- a/python/cudf/cudf/_lib/cpp/strings/findall.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/findall.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,6 +10,10 @@ from cudf._lib.cpp.table.table cimport table cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: - cdef unique_ptr[table] findall_re( - column_view source_strings, - string pattern) except + + cdef unique_ptr[table] findall( + const column_view& source_strings, + const string& pattern) except + + + cdef unique_ptr[column] findall_record( + const column_view& source_strings, + const string& pattern) except + diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 80af18e7c71..b17988018a6 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,7 +8,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re +from cudf._lib.cpp.strings.findall cimport ( + findall as cpp_findall, + findall_record as cpp_findall_record, +) from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr @@ -25,7 +28,7 @@ def findall(Column source_strings, pattern): cdef string pattern_string = str(pattern).encode() with nogil: - c_result = move(cpp_findall_re( + c_result = move(cpp_findall( source_view, pattern_string )) @@ -34,3 +37,22 @@ def findall(Column source_strings, pattern): move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) + + +def findall_record(Column source_strings, pattern): + """ + Returns data with all non-overlapping matches 
of `pattern` + in each string of `source_strings` as a lists column. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_findall_record( + source_view, + pattern_string + )) + + return Column.from_unique_ptr(move(c_result))