diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aae62fbd47c..6d1c0528832 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -86,7 +86,7 @@ git submodule update --init --remote --recursive ```bash # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.0.yml +conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml # activate the environment conda activate cudf_dev ``` diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86ec24c1b7b..84e486c7e18 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -343,6 +343,7 @@ add_library( src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu + src/lists/sequences.cu src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu @@ -416,7 +417,8 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu - src/strings/extract.cu + src/strings/extract/extract.cu + src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu src/strings/findall.cu diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index b58bdb55de3..16e7a58b020 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 6433e8ad7571f14cc5384051b049029c60dd1ce0 + GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 574bfa26a0c..fcf9f0d73ee 100644 --- 
a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.12.0) +set(CUDF_MIN_VERSION_Thrust 1.15.0) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 17bea935dfd..117119cd40f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -285,280 +285,66 @@ std::unique_ptr extract_quarter( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @} */ // end of group - -/** - * @brief Round up to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Round down to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. + * @brief Fixed frequencies supported by datetime rounding functions ceil, floor, round. * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column */ -std::unique_ptr floor_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +enum class rounding_frequency : int32_t { + DAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; /** - * @brief Round down to the nearest hour + * @brief Round datetimes up to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round up to. * @param mr Device memory resource used to allocate device memory of the returned column. 
* - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_hour( +std::unique_ptr ceil_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Round down to the nearest minute + * @brief Round datetimes down to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round down to. * @param mr Device memory resource used to allocate device memory of the returned column. * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_minute( +std::unique_ptr floor_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Round down to the nearest second + * @brief Round datetimes to the nearest multiple of the given frequency. * - * @param column cudf::column_view of the input datetime values + * @param column cudf::column_view of the input datetime values. + * @param freq rounding_frequency indicating the frequency to round to. * @param mr Device memory resource used to allocate device memory of the returned column. 
* - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column + * @throw cudf::logic_error if input column datatype is not TIMESTAMP. + * @return cudf::column of the same datetime resolution as the input column. */ -std::unique_ptr floor_second( +std::unique_ptr round_datetimes( cudf::column_view const& column, + rounding_frequency freq, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Round down to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** @} */ // end of group } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index bd5c8a42a51..0fc807593fb 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -32,17 +32,15 @@ namespace detail { */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - cudf::host_span initial_hash = {}, - uint32_t seed = 0, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, - cudf::host_span initial_hash = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view const& input, diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index aff0d20a467..905a897eb40 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -169,7 +169,7 @@ std::unique_ptr repeat( * @param init First value in the sequence * @param step Increment value * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -195,7 +195,7 @@ std::unique_ptr sequence( * @param size Size of the output column * @param init First value in the sequence * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -223,7 +223,7 @@ std::unique_ptr sequence( * @param months Months to increment * @param mr Device memory resource used to allocate the returned column's device memory * - * @returns Timestamps column with sequences of months. + * @return Timestamps column with sequences of months. */ std::unique_ptr calendrical_month_sequence( size_type size, diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 6b281c3f7f4..cce05042917 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -31,8 +31,6 @@ namespace cudf { * * @param input The table of columns to hash. * @param hash_function The hash function enum to use. 
- * @param initial_hash Optional host_span of initial hash values for each column. - * If this span is empty then each element will be hashed as-is. * @param seed Optional seed value to use for the hash function. * @param mr Device memory resource used to allocate the returned column's device memory. * @@ -40,10 +38,9 @@ namespace cudf { */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - cudf::host_span initial_hash = {}, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp new file mode 100644 index 00000000000..74a4dac1e10 --- /dev/null +++ b/cpp/include/cudf/lists/filling.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf::lists { +/** + * @addtogroup lists_filling + * @{ + * @file + * @brief Column APIs for individual list sequence + */ + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `size`) parameters. 
+ * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by one, and its cardinality is specified by a `size` value. The `start` and `size` + * values used to generate each list are taken from the corresponding row of the input @p starts and + * @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 5, 6] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences. + */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `step`, `size`) parameters. + * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by a `step` value, and its cardinality is specified by a `size` value. The values + * `start`, `step`, and `size` used to generate each list are taken from the corresponding row of + * the input @p starts, @p steps, and @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - @p starts and @p steps columns must have the same type. 
+ * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * steps = [2, 1, 1, 1, -3] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 1, -2] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p steps columns have different types. + * @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param steps Increment values for the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences. + */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace cudf::lists diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index b35f5df2903..9da3c6b0e91 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -33,6 +33,12 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Basic type expected for iterators passed to `make_strings_column` that represent string + * data in device memory. + */ +using string_index_pair = thrust::pair; + /** * @brief Average string byte-length threshold for deciding character-level * vs. row-level parallel algorithm. 
@@ -64,8 +70,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, size_type strings_count = thrust::distance(begin, end); if (strings_count == 0) return make_empty_column(type_id::STRING); - using string_index_pair = thrust::pair; - // check total size is not too large for cudf column auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 6f5902266b2..466f71aace0 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,21 @@ namespace strings { */ /** - * @brief Returns a vector of strings columns for each matching group specified in the given regular - * expression pattern. + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regular expression pattern. * * All the strings for the first group will go in the first output column; the second group - * go in the second column and so on. Null entries are added if the string does match. + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. * * Any null string entries return corresponding null output column entries. * * @code{.pseudo} * Example: - * s = ["a1","b2","c3"] - * r = extract(s,"([ab])(\\d)") - * r is now [["a","b",null], - * ["1","2",null]] + * s = ["a1", "b2", "c3"] + * r = extract(s, "([ab])(\\d)") + * r is now [ ["a", "b", null], + * ["1", "2", null] ] * @endcode * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. @@ -55,6 +56,39 @@ std::unique_ptr
extract( std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regular expression pattern. + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * r = extract_all(s,"([ab])(\\d)") + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation. + * @param pattern The regular expression pattern with group indicators. + * @param mr Device memory resource used to allocate any returned device memory. + * @return Lists column containing strings extracted from the input column. + */ +std::unique_ptr extract_all( + strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 0f3ca073380..32ddd1ef49a 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -539,52 +539,4 @@ class row_hasher { uint32_t _seed{DEFAULT_HASH_SEED}; }; -/** - * @brief Computes the hash value of a row in the given table, combined with an - * initial hash value for each column. - * - * @tparam hash_function Hash functor to use for hashing elements. 
- * @tparam Nullate A cudf::nullate type describing how to check for nulls. - */ -template