diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 4ea7e3ee952..275b7223a3b 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -46,6 +46,7 @@ constexpr uint8_t IS_LOWER(uint8_t x) { return ((x) & (1 << 6)); } constexpr uint8_t IS_SPECIAL(uint8_t x) { return ((x) & (1 << 7)); } constexpr uint8_t IS_ALPHANUM(uint8_t x) { return ((x) & (0x0F)); } constexpr uint8_t IS_UPPER_OR_LOWER(uint8_t x) { return ((x) & ((1 << 5) | (1 << 6))); } +constexpr uint8_t ALL_FLAGS = 0xFF; // Type for the character cases table. using character_cases_table_type = uint16_t; diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 03bf538b1b2..265adc60392 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -17,6 +17,8 @@ #include +#include + #include /** diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh new file mode 100644 index 00000000000..472101959a6 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "udf_string.cuh" + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Global variables for character-type flags and case conversion + */ +struct chars_tables { + cudf::strings::detail::character_flags_table_type* flags_table; + cudf::strings::detail::character_cases_table_type* cases_table; + struct cudf::strings::detail::special_case_mapping* special_case_mapping_table; +}; + +namespace detail { + +/** + * @brief Utility for converting a single character + * + * There are special cases where the conversion may result in multiple characters. + * + * @param tables The char tables required for conversion + * @param result String to append the converted character + * @param code_point The code-point of the character to convert + * @param flag The char-type flag of the character to convert + */ +__device__ inline void convert_char(chars_tables const tables, + udf_string& result, + uint32_t code_point, + uint8_t flag) +{ + if (!cudf::strings::detail::IS_SPECIAL(flag)) { + result.append(cudf::strings::detail::codepoint_to_utf8(tables.cases_table[code_point])); + return; + } + + // handle special case + auto const map = + tables + .special_case_mapping_table[cudf::strings::detail::get_special_case_hash_index(code_point)]; + auto const output_count = + cudf::strings::detail::IS_LOWER(flag) ? map.num_upper_chars : map.num_lower_chars; + auto const* output_chars = cudf::strings::detail::IS_LOWER(flag) ? 
map.upper : map.lower; + for (uint16_t idx = 0; idx < output_count; idx++) { + result.append(cudf::strings::detail::codepoint_to_utf8(output_chars[idx])); + } +} + +/** + * @brief Converts the given string to either upper or lower case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @param case_flag Identifies upper/lower case conversion + * @return New string containing the converted characters + */ +__device__ inline udf_string convert_case( + chars_tables const tables, + string_view d_str, + cudf::strings::detail::character_flags_table_type case_flag) +{ + udf_string result; + for (auto const chr : d_str) { + auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0; + + if ((flag & case_flag) || (cudf::strings::detail::IS_SPECIAL(flag) && + !cudf::strings::detail::IS_UPPER_OR_LOWER(flag))) { + convert_char(tables, result, code_point, flag); + } else { + result.append(chr); + } + } + + return result; +} + +/** + * @brief Utility for capitalize and title functions + * + * @tparam CapitalizeNextFn returns true if the next candidate character should be capitalized + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @param next_fn Function for next character capitalized + * @return New string containing the converted characters + */ +template +__device__ inline udf_string capitalize(chars_tables const tables, + string_view d_str, + CapitalizeNextFn next_fn) +{ + udf_string result; + bool capitalize = true; + for (auto const chr : d_str) { + auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0; + auto const change_case = + capitalize ? 
cudf::strings::detail::IS_LOWER(flag) : cudf::strings::detail::IS_UPPER(flag); + if (change_case) { + detail::convert_char(tables, result, code_point, flag); + } else { + result.append(chr); + } + capitalize = next_fn(flag); + } + return result; +} +} // namespace detail + +/** + * @brief Converts the given string to lower case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string to_lower(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = cudf::strings::detail::IS_UPPER( + cudf::strings::detail::ALL_FLAGS); // convert only upper case characters + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Converts the given string to upper case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string to_upper(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = cudf::strings::detail::IS_LOWER( + cudf::strings::detail::ALL_FLAGS); // convert only lower case characters + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Converts the given string to lower/upper case + * + * All lower case characters are converted to upper case and + * all upper case characters are converted to lower case. 
+ * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string swap_case(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = + cudf::strings::detail::IS_LOWER(cudf::strings::detail::ALL_FLAGS) | + cudf::strings::detail::IS_UPPER(cudf::strings::detail::ALL_FLAGS); + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Capitalize the first character of the given string + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string capitalize(chars_tables const tables, string_view d_str) +{ + auto next_fn = [](cudf::strings::detail::character_flags_table_type) -> bool { return false; }; + return detail::capitalize(tables, d_str, next_fn); +} + +/** + * @brief Converts the given string to title case + * + * The first character after a non-character is converted to upper case. + * All other characters are converted to lower case. 
+ * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string title(chars_tables const tables, string_view d_str) +{ + auto next_fn = [](cudf::strings::detail::character_flags_table_type flag) -> bool { + return !cudf::strings::detail::IS_ALPHA(flag); + }; + return detail::capitalize(tables, d_str, next_fn); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh new file mode 100644 index 00000000000..c8c9f6e46f4 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -0,0 +1,72 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Converts a string into an integer + * + * The '+' and '-' are allowed but only at the beginning of the string. + * The string is expected to contain base-10 [0-9] characters only. + * Any other character will end the parse. + * Overflow of the int64 type is not detected. 
+ */ +__device__ inline int64_t stoi(string_view const& d_str) +{ + return cudf::strings::detail::string_to_integer(d_str); +} + +/** + * @brief Converts an integer into string + * + * @param value integer value to convert + */ +__device__ inline udf_string to_string(int64_t value) +{ + udf_string result; + if (value == 0) { + result.append("0"); + return result; + } + result.resize(cudf::strings::detail::count_digits(value)); + cudf::strings::detail::integer_to_string(value, result.data()); + return result; +} + +/** + * @brief Converts a string into a double + * + * This function supports scientific notation. + * Overflow goes to inf or -inf and underflow may go to 0. + */ +__device__ inline double stod(string_view const& d_str) +{ + return cudf::strings::detail::stod(d_str); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh new file mode 100644 index 00000000000..d6d4ed637e9 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh @@ -0,0 +1,72 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "udf_string.cuh" + +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Pad beginning and/or end of a string with the given fill character + * + * The side_type::BOTH will attempt to center the text using the `fill_char`. + * If `width <= d_str.length()` no change occurs and the input `d_str` is returned. + * + * @tparam side Specify where the padding should occur + * @param d_str String to pad + * @param width Minimum length in characters of the output string + * @param fill_char Character used for padding + */ +template +__device__ udf_string pad(cudf::string_view const d_str, + cudf::size_type width, + cudf::string_view fill_char = cudf::string_view{" ", 1}) +{ + if (fill_char.empty()) { return udf_string{d_str}; } + + udf_string result; + result.resize(cudf::strings::detail::compute_padded_size(d_str, width, fill_char.size_bytes())); + cudf::strings::detail::pad_impl(d_str, width, *fill_char.begin(), result.data()); + return result; +} + +/** + * @brief Pad beginning of a string with zero '0' + * + * If the `width` is smaller than the length of `d_str` no change occurs. + * + * If `d_str` starts with a sign character ('-' or '+') then '0' padding + * starts after the sign. 
+ * + * @param d_str String to fill + * @param width Minimum length in characters of the output string (including the sign character) + */ +__device__ udf_string zfill(cudf::string_view const d_str, cudf::size_type width) +{ + udf_string result; + result.resize(cudf::strings::detail::compute_padded_size(d_str, width, 1)); + cudf::strings::detail::zfill_impl(d_str, width, result.data()); + return result; +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh new file mode 100644 index 00000000000..ca31425aa62 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "udf_string.cuh" + +#include +#include + +namespace cudf { +namespace strings { +namespace udf { +namespace detail { + +/** + * @brief Split string using given string + * + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. This function can be called with a `result=nullptr` + * to compute the number of tokens. 
+ * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{" ", 1}; + * auto token_count = split(d_str, tgt, nullptr); + * auto result = new cudf::string_view[token_count]; + * split(d_str, tgt, result); + * // result is array like ["the", "best", "", "of", "times", ""] + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + cudf::string_view const tgt, + cudf::string_view* result) +{ + auto const nchars = d_str.length(); + cudf::size_type count = 0; + + cudf::size_type last_pos = 0; + while (last_pos <= nchars) { + cudf::size_type const pos = d_str.find(tgt, last_pos); + auto const length = (pos < 0 ? nchars : pos) - last_pos; + if (result) { *result++ = d_str.substr(last_pos, length); } + last_pos = pos + tgt.length(); + ++count; + if (pos < 0) { break; } + } + + return count; +} +} // namespace detail + +/** + * @brief Count tokens in a string without performing the split + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{" ", 1}; + * auto token_count = count_tokens(d_str, tgt); + * // token_count is 6 + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @return Number of tokens returned + */ +__device__ inline cudf::size_type count_tokens(cudf::string_view const d_str, + cudf::string_view const tgt) +{ + return detail::split(d_str, tgt, nullptr); +} + +/** + * @brief Split string using given string + * + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. 
+ * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{" ", 1}; + * auto token_count = count_tokens(d_str, tgt); + * auto result = new cudf::string_view[token_count]; + * split(d_str, tgt, result); + * // result is array like ["the", "best", "", "of", "times", ""] + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @param result Empty array to populate with output objects. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + cudf::string_view const tgt, + cudf::string_view* result) +{ + return detail::split(d_str, tgt, result); +} + +/** + * @brief Split string using given target array + * + * @param d_str String to split + * @param tgt Character array encoded in UTF-8 used for identifying split points + * @param bytes Number of bytes to read from `tgt` + * @param result Empty array to populate with output objects + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + char const* tgt, + cudf::size_type bytes, + cudf::string_view* result) +{ + return detail::split(d_str, cudf::string_view{tgt, bytes}, result); +} + +/** + * @brief Split string using given target array + * + * @param d_str String to split + * @param tgt Null-terminated character array encoded in UTF-8 used for identifying split points + * @param result Empty array to populate with output objects + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + char const* tgt, + cudf::string_view* result) +{ + return split(d_str, tgt, detail::bytes_in_null_terminated_string(tgt), result); +} + +namespace detail { +/** + * @brief Split string on whitespace + * + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. 
This function can be called with a `result=nullptr` + * to compute the number of tokens. + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto token_count = split(d_str, nullptr); + * auto result = new cudf::string_view[token_count]; + * split(d_str, result); + * // result is array like ["the", "best", "of", "times"] + * @endcode + * + * @param d_str String to split + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::string_view* result) +{ + cudf::strings::detail::whitespace_string_tokenizer tokenizer{d_str}; + cudf::size_type count = 0; + while (tokenizer.next_token()) { + auto token = tokenizer.get_token(); + if (result) { *result++ = d_str.substr(token.first, token.second - token.first); } + ++count; + } + return count; +} +} // namespace detail + +/** + * @brief Count tokens in a string without performing the split on whitespace + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto token_count = count_tokens(d_str); + * // token_count is 4 + * @endcode + * + * @param d_str String to split + * @return Number of tokens returned + */ +__device__ inline cudf::size_type count_tokens(cudf::string_view const d_str) +{ + return detail::split(d_str, nullptr); +} + +/** + * @brief Split string on whitespace + * + * This will create tokens by splitting on one or more consecutive whitespace characters + * found in `d_str`. + * + * @param d_str String to split + * @param result Empty array to populate with output objects. 
+ * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::string_view* result) +{ + return detail::split(d_str, result); +} + +/** + * @brief Join an array of strings with a separator + * + * @code{.cpp} + * auto separator = cudf::string_view{"::", 2}; + * cudf::string_view input[] = { + * cudf::string_view{"hello", 5}, + * cudf::string_view{"goodbye", 7}, + * cudf::string_view{"world", 5} }; + * + * auto result = join(separator, input, 3); + * // result is "hello::goodbye::world" + * @endcode + * + * @param separator Separator string + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(cudf::string_view const separator, + cudf::string_view* input, + cudf::size_type count) +{ + udf_string result{""}; + while (count-- > 0) { + result += *input++; + if (count > 0) { result += separator; } + } + return result; +} + +/** + * @brief Join an array of strings with a separator + * + * @param separator Null-terminated UTF-8 string + * @param bytes Number of bytes to read from `separator` + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(char const* separator, + cudf::size_type bytes, + cudf::string_view* input, + cudf::size_type count) +{ + return join(cudf::string_view{separator, bytes}, input, count); +} + +/** + * @brief Join an array of strings with a separator + * + * @param separator Null-terminated UTF-8 string + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(char const* separator, + cudf::string_view* input, + cudf::size_type count) +{ + return join(separator, detail::bytes_in_null_terminated_string(separator), input, count); +} + +} // namespace udf +} // namespace strings +} // namespace cudf 
diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh new file mode 100644 index 00000000000..f2db3073460 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh @@ -0,0 +1,80 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Strip characters from the beginning and/or end of the given string + * + * The `d_to_strip` is interpreted as an array of characters to be removed. + * If `d_to_strip` is an empty string, whitespace characters are stripped. 
+ * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = strip(d_str, d_to_strip); + * // result is "aba" + * d_to_strip = cudf::string_view{" a", 2}; // space and 'a' + * result = strip(d_str, d_to_strip); + * // result is "b" ('a' or ' ' removed from the ends) + * @endcode + * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = strip(d_str, d_to_strip, side_type::LEFT); + * // result is "aba " + * d_to_strip = cudf::string_view{"a ", 2}; // 'a' and space + * result = strip(d_str, d_to_strip, side_type::LEFT); + * // result is "ba " ('a' or ' ' removed from the beginning) + * @endcode + * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = strip(d_str, d_to_strip, side_type::RIGHT); + * // result is " aba" + * d_to_strip = cudf::string_view{" a", 2}; // space and 'a' + * result = rstrip(d_str, d_to_strip, side_type::RIGHT); + * // result is " ab" ('a' or ' ' removed from the end) + * @endcode + * + * @param d_str String to strip characters from + * @param d_to_strip Characters to remove + * @param stype From where to strip the characters; + * Default `BOTH` indicates stripping characters from the + * beginning and the end of the input string `d_str` + * @return New string with characters removed + */ +__device__ udf_string strip(cudf::string_view const d_str, + cudf::string_view const d_to_strip, + side_type stype = side_type::BOTH) +{ + return udf_string{cudf::strings::detail::strip(d_str, d_to_strip, stype)}; +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index 6de9b91de08..68834afa082 100644 --- 
a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -26,14 +27,47 @@ namespace cudf { namespace strings { namespace udf { +class udf_string; + /** * @brief Return a cudf::string_view array for the given strings column * + * No string data is copied so the input column controls the lifetime of the + * underlying strings. + * + * New device memory is allocated and returned to hold just the string_view instances. + * * @param input Strings column to convert to a string_view array. - * @throw cudf::logic_error if input is not a strings column. + * @return Array of string_view objects in device memory */ std::unique_ptr to_string_view_array(cudf::column_view const input); +/** + * @brief Return a STRINGS column given an array of udf_string objects + * + * This will make a copy of the strings in d_string in order to build + * the output column. + * The individual udf_strings are also cleared freeing each of their internal + * device memory buffers. + * + * @param d_strings Pointer to device memory of udf_string objects + * @param size The number of elements in the d_strings array + * @return A strings column copy of the udf_string objects + */ +std::unique_ptr column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size); + +/** + * @brief Frees a vector of udf_string objects + * + * The individual udf_strings are cleared freeing each of their internal + * device memory buffers. 
+ * + * @param d_strings Pointer to device memory of udf_string objects + * @param size The number of elements in the d_strings array + */ +void free_udf_string_array(udf_string* d_strings, cudf::size_type size); + } // namespace udf } // namespace strings } // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh new file mode 100644 index 00000000000..5c9a02a9510 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "udf_string.hpp" + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { +namespace detail { + +/** + * @brief Count the bytes in a null-terminated character array + * + * @param str Null-terminated string + * @return Number of bytes in `str` upto but not including the null-terminator + */ +__device__ inline static cudf::size_type bytes_in_null_terminated_string(char const* str) +{ + if (!str) return 0; + cudf::size_type bytes = 0; + while (*str++) + ++bytes; + return bytes; +} + +} // namespace detail + +/** + * @brief Allocate memory for strings operation + * + * @param bytes Number of bytes in to allocate + * @return Pointer to allocated memory + */ +__device__ inline char* udf_string::allocate(cudf::size_type bytes) +{ + char* data = static_cast(malloc(bytes + 1)); + data[bytes] = '\0'; // add null-terminator so we can printf strings in device code + return data; +} + +/** + * @brief Free memory created by allocate() + * + * @param data Pointer to allocated memory + */ +__device__ inline void udf_string::deallocate(char* data) +{ + if (data) free(data); +} + +/** + * @brief Allocate memory for strings operation + * + * Reallocates memory for `m_data` with new size `bytes` + * The original data in `m_data` is preserved up to `min(bytes,m_bytes)` + * + * @param bytes Number of bytes in to allocate + * @return Pointer to allocated memory + */ +__device__ void udf_string::reallocate(cudf::size_type bytes) +{ + m_capacity = bytes; + auto new_data = allocate(m_capacity); + memcpy(new_data, m_data, std::min(m_bytes, bytes)); + deallocate(m_data); + m_data = new_data; +} + +__device__ inline udf_string::udf_string(char const* data, cudf::size_type bytes) + : m_bytes(bytes), m_capacity(bytes) +{ + m_data = allocate(m_capacity); + memcpy(m_data, data, bytes); +} + +__device__ udf_string::udf_string(cudf::size_type count, cudf::char_utf8 chr) +{ + if (count <= 0) { return; } + m_bytes 
= m_capacity = cudf::strings::detail::bytes_in_char_utf8(chr) * count; + m_data = allocate(m_capacity); + auto out_ptr = m_data; + for (cudf::size_type idx = 0; idx < count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } +} + +__device__ inline udf_string::udf_string(char const* data) + : udf_string(data, detail::bytes_in_null_terminated_string(data)) +{ +} + +__device__ inline udf_string::udf_string(udf_string const& src) + : udf_string(src.m_data, src.m_bytes) +{ +} + +__device__ inline udf_string::udf_string(udf_string&& src) noexcept + : m_data(src.m_data), m_bytes(src.m_bytes), m_capacity(src.m_capacity) +{ + src.m_data = nullptr; + src.m_bytes = 0; + src.m_capacity = 0; +} + +__device__ inline udf_string::udf_string(cudf::string_view str) + : udf_string(str.data(), str.size_bytes()) +{ +} + +__device__ inline udf_string::~udf_string() { deallocate(m_data); } + +__device__ inline udf_string& udf_string::operator=(udf_string const& str) { return assign(str); } + +__device__ inline udf_string& udf_string::operator=(udf_string&& str) noexcept +{ + return assign(std::move(str)); +} + +__device__ inline udf_string& udf_string::operator=(cudf::string_view str) { return assign(str); } + +__device__ inline udf_string& udf_string::operator=(char const* str) { return assign(str); } + +__device__ udf_string& udf_string::assign(udf_string&& str) noexcept +{ + if (this == &str) { return *this; } + m_data = str.m_data; + m_bytes = str.m_bytes; + m_capacity = str.m_capacity; + str.m_data = nullptr; + str.m_bytes = 0; + str.m_capacity = 0; + return *this; +} + +__device__ udf_string& udf_string::assign(cudf::string_view str) +{ + return assign(str.data(), str.size_bytes()); +} + +__device__ udf_string& udf_string::assign(char const* str) +{ + return assign(str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ udf_string& udf_string::assign(char const* str, cudf::size_type bytes) +{ + if (bytes >= m_capacity) { + 
deallocate(m_data); + m_capacity = bytes; + m_data = allocate(m_capacity); + } + m_bytes = bytes; + memcpy(m_data, str, bytes); + m_data[m_bytes] = '\0'; + return *this; +} + +__device__ inline cudf::size_type udf_string::size_bytes() const noexcept { return m_bytes; } + +__device__ inline cudf::size_type udf_string::length() const noexcept +{ + return cudf::strings::detail::characters_in_string(m_data, m_bytes); +} + +__device__ constexpr cudf::size_type udf_string::max_size() const noexcept +{ + return std::numeric_limits::max() - 1; +} + +__device__ inline char* udf_string::data() noexcept { return m_data; } + +__device__ inline char const* udf_string::data() const noexcept { return m_data; } + +__device__ inline bool udf_string::is_empty() const noexcept { return m_bytes == 0; } + +__device__ inline cudf::string_view::const_iterator udf_string::begin() const noexcept +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), 0); +} + +__device__ inline cudf::string_view::const_iterator udf_string::end() const noexcept +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), length()); +} + +__device__ inline cudf::char_utf8 udf_string::at(cudf::size_type pos) const +{ + auto const offset = byte_offset(pos); + auto chr = cudf::char_utf8{0}; + if (offset < m_bytes) { cudf::strings::detail::to_char_utf8(data() + offset, chr); } + return chr; +} + +__device__ inline cudf::char_utf8 udf_string::operator[](cudf::size_type pos) const +{ + return at(pos); +} + +__device__ inline cudf::size_type udf_string::byte_offset(cudf::size_type pos) const +{ + cudf::size_type offset = 0; + + auto start = m_data; + auto end = start + m_bytes; + while ((pos > 0) && (start < end)) { + auto const byte = static_cast(*start++); + auto const char_bytes = cudf::strings::detail::bytes_in_utf8_byte(byte); + if (char_bytes) { --pos; } + offset += char_bytes; + } + return offset; +} + +__device__ inline int udf_string::compare(cudf::string_view 
in) const noexcept +{ + return compare(in.data(), in.size_bytes()); +} + +__device__ inline int udf_string::compare(char const* data, cudf::size_type bytes) const +{ + // delegate to string_view's comparison through the conversion operator + auto const view = static_cast<cudf::string_view>(*this); + return view.compare(data, bytes); +} + +__device__ inline bool udf_string::operator==(cudf::string_view rhs) const noexcept +{ + return m_bytes == rhs.size_bytes() && compare(rhs) == 0; +} + +__device__ inline bool udf_string::operator!=(cudf::string_view rhs) const noexcept +{ + return compare(rhs) != 0; +} + +__device__ inline bool udf_string::operator<(cudf::string_view rhs) const noexcept +{ + return compare(rhs) < 0; +} + +__device__ inline bool udf_string::operator>(cudf::string_view rhs) const noexcept +{ + return compare(rhs) > 0; +} + +__device__ inline bool udf_string::operator<=(cudf::string_view rhs) const noexcept +{ + return compare(rhs) <= 0; +} + +__device__ inline bool udf_string::operator>=(cudf::string_view rhs) const noexcept +{ + return compare(rhs) >= 0; +} + +__device__ inline void udf_string::clear() noexcept +{ + deallocate(m_data); + m_data = nullptr; + m_bytes = 0; + m_capacity = 0; +} + +__device__ inline void udf_string::resize(cudf::size_type count) +{ + // ignore out-of-range requests: negative, or larger than max_size() + if (count < 0 || count > max_size()) { return; } + if (count > m_capacity) { reallocate(count); } + + // add padding if necessary (null chars) + if (count > m_bytes) { memset(m_data + m_bytes, 0, count - m_bytes); } + + m_bytes = count; + // m_data is null after clear() or default construction; only terminate an existing buffer + if (m_data != nullptr) { m_data[m_bytes] = '\0'; } +} + +// inline: defined in a header, like the other member definitions in this file +__device__ inline void udf_string::reserve(cudf::size_type count) +{ + if (count < max_size() && count > m_capacity) { reallocate(count); } +} + +__device__ inline cudf::size_type udf_string::capacity() const noexcept { return m_capacity; } + +__device__ inline void udf_string::shrink_to_fit() +{ + if (m_bytes < m_capacity) { reallocate(m_bytes); } +} + +__device__ inline udf_string& udf_string::append(char const* str, cudf::size_type bytes) +{ + if (bytes <= 0) { return *this; } + auto const nbytes = m_bytes + bytes; + if (nbytes > m_capacity) { 
reallocate(2 * nbytes); } + memcpy(m_data + m_bytes, str, bytes); + m_bytes = nbytes; + m_data[m_bytes] = '\0'; + return *this; +} + +__device__ inline udf_string& udf_string::append(char const* str) +{ + return append(str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::append(cudf::char_utf8 chr, cudf::size_type count) +{ + auto d_str = udf_string(count, chr); + return append(d_str); +} + +__device__ inline udf_string& udf_string::append(cudf::string_view in) +{ + return append(in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::operator+=(cudf::string_view in) { return append(in); } + +__device__ inline udf_string& udf_string::operator+=(cudf::char_utf8 chr) { return append(chr); } + +__device__ inline udf_string& udf_string::operator+=(char const* str) { return append(str); } + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, + char const* str, + cudf::size_type in_bytes) +{ + return replace(pos, 0, str, in_bytes); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, char const* str) +{ + return insert(pos, str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, cudf::string_view in) +{ + return insert(pos, in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, + cudf::size_type count, + cudf::char_utf8 chr) +{ + return replace(pos, 0, count, chr); +} + +__device__ inline udf_string udf_string::substr(cudf::size_type pos, cudf::size_type count) const +{ + if (pos < 0) { return udf_string{"", 0}; } + auto const start_pos = byte_offset(pos); + if (start_pos >= m_bytes) { return udf_string{"", 0}; } + auto const end_pos = count < 0 ? 
m_bytes : std::min(byte_offset(pos + count), m_bytes); + return udf_string{data() + start_pos, end_pos - start_pos}; +} + +// utility for replace() +// inline: defined in a header, like the other member definitions in this file +__device__ inline void udf_string::shift_bytes(cudf::size_type start_pos, + cudf::size_type end_pos, + cudf::size_type nbytes) +{ + if (nbytes < m_bytes) { + // shift bytes to the left [...wxyz] -> [wxyzxyz] + auto src = end_pos; + auto tgt = start_pos; + while (tgt < nbytes) { + m_data[tgt++] = m_data[src++]; + } + } else if (nbytes > m_bytes) { + // shift bytes to the right [abcd...] -> [abcabcd] + auto src = m_bytes; + auto tgt = nbytes; + while (src > end_pos) { + m_data[--tgt] = m_data[--src]; + } + } +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + char const* str, + cudf::size_type in_bytes) +{ + if (pos < 0 || in_bytes < 0) { return *this; } + auto const start_pos = byte_offset(pos); + if (start_pos > m_bytes) { return *this; } + auto const end_pos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + + // compute new size + auto const nbytes = m_bytes + in_bytes - (end_pos - start_pos); + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + + // move bytes -- make room for replacement + shift_bytes(start_pos + in_bytes, end_pos, nbytes); + + // insert the replacement; erase() passes a null `str` with zero bytes + if (in_bytes > 0) { memcpy(m_data + start_pos, str, in_bytes); } + + m_bytes = nbytes; + // m_data is still null when nothing was ever allocated and nothing was inserted + if (m_data != nullptr) { m_data[m_bytes] = '\0'; } + return *this; +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + char const* str) +{ + return replace(pos, count, str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + cudf::string_view in) +{ + return replace(pos, count, in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr) +{ + auto d_str = 
udf_string(chr_count, chr); + return replace(pos, count, d_str); +} + +// Removes characters by replacing the range with an empty (zero-byte) string. +// inline: defined in a header, like the other member definitions in this file +__device__ inline udf_string& udf_string::erase(cudf::size_type pos, cudf::size_type count) +{ + return replace(pos, count, nullptr, 0); +} + +__device__ inline cudf::size_type udf_string::char_offset(cudf::size_type byte_pos) const +{ + return cudf::strings::detail::characters_in_string(data(), byte_pos); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp new file mode 100644 index 00000000000..2bbda357cee --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +// This header contains all class and function declarations so that it +// can be included in a .cpp file which only has declaration requirements +// (i.e. sizeof, conditionally-comparable, explicit conversions, etc). +// The definitions are coded in udf_string.cuh which is to be included +// in .cu files that use this class in kernel calls. + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Device string class for use with user-defined functions + * + * This class manages a device buffer of UTF-8 encoded characters + * for string manipulation in a device kernel. 
+ * + * Its methods and behavior are modelled after std::string but + * with special consideration for UTF-8 encoded strings and for + * use within a cuDF UDF. + */ +class udf_string { + public: + /** + * @brief Represents unknown character position or length + */ + static constexpr cudf::size_type npos = static_cast<cudf::size_type>(-1); + + /** + * @brief Cast to cudf::string_view operator + */ + __device__ operator cudf::string_view() const { return cudf::string_view(m_data, m_bytes); } + + /** + * @brief Create an empty string. + */ + udf_string() = default; + + /** + * @brief Create a string using existing device memory + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded string + * @param bytes Number of bytes in `data` + */ + __device__ udf_string(char const* data, cudf::size_type bytes); + + /** + * @brief Create a string object from a null-terminated character array + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded null-terminated + * character array. + */ + __device__ udf_string(char const* data); + + /** + * @brief Create a string object from a cudf::string_view + * + * The input string data is copied into the instance returned. + * + * @param str String to copy + */ + __device__ udf_string(cudf::string_view str); + + /** + * @brief Create a string object with `count` copies of character `chr` + * + * @param count Number of times to copy `chr` + * @param chr Character from which to create the string + */ + __device__ udf_string(cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Create a string object from another instance + * + * The string data is copied from the `src` into the instance returned. + * + * @param src String to copy + */ + __device__ udf_string(udf_string const& src); + + /** + * @brief Move a string object from an rvalue reference + * + * The string data is moved from `src` into the instance returned. 
+ * The `src` will have no content. + * + * @param src String to copy + */ + __device__ udf_string(udf_string&& src) noexcept; + + __device__ ~udf_string(); + + __device__ udf_string& operator=(udf_string const&); + __device__ udf_string& operator=(udf_string&&) noexcept; + __device__ udf_string& operator=(cudf::string_view const); + __device__ udf_string& operator=(char const*); + + /** + * @brief Return the number of bytes in this string + */ + __device__ cudf::size_type size_bytes() const noexcept; + + /** + * @brief Return the number of characters in this string + */ + __device__ cudf::size_type length() const noexcept; + + /** + * @brief Return the maximum number of bytes a udf_string can hold + */ + __device__ constexpr cudf::size_type max_size() const noexcept; + + /** + * @brief Return the internal pointer to the character array for this object + */ + __device__ char* data() noexcept; + __device__ char const* data() const noexcept; + + /** + * @brief Returns true if there are no characters in this string + */ + __device__ bool is_empty() const noexcept; + + /** + * @brief Returns an iterator that can be used to navigate through + * the UTF-8 characters in this string + * + * This returns a `cudf::string_view::const_iterator` which is read-only. + */ + __device__ cudf::string_view::const_iterator begin() const noexcept; + __device__ cudf::string_view::const_iterator end() const noexcept; + + /** + * @brief Returns the character at the specified position + * + * This will return 0 if `pos >= length()`. + * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 at(cudf::size_type pos) const; + + /** + * @brief Returns the character at the specified index + * + * This will return 0 if `pos >= length()`. + * Note this is read-only. Use replace() to modify a character. 
+ * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 operator[](cudf::size_type pos) const; + + /** + * @brief Return the byte offset for a given character position + * + * The byte offset for the character at `pos` such that + * `data() + byte_offset(pos)` points to the memory location + * of the character at position `pos`. + * + * The behavior is undefined if `pos < 0 or pos >= length()` + * + * @param pos Index position of character to return byte offset. + * @return Byte offset for character at `pos` + */ + __device__ cudf::size_type byte_offset(cudf::size_type pos) const; + + /** + * @brief Comparing target string with this string + * + * @param str Target string to compare with this string + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but the `str` string is shorter. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but the `str` string is longer. + */ + __device__ int compare(cudf::string_view str) const noexcept; + + /** + * @brief Comparing target character array with this string + * + * @param str Target array of UTF-8 characters. + * @param bytes Number of bytes in `str`. + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but `bytes < size_bytes()`. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but `bytes > size_bytes()`. 
+ */ + __device__ int compare(char const* str, cudf::size_type bytes) const; + + /** + * @brief Returns true if `rhs` matches this string exactly + */ + __device__ bool operator==(cudf::string_view rhs) const noexcept; + + /** + * @brief Returns true if `rhs` does not match this string + */ + __device__ bool operator!=(cudf::string_view rhs) const noexcept; + + /** + * @brief Returns true if this string is ordered before `rhs` + */ + __device__ bool operator<(cudf::string_view rhs) const noexcept; + + /** + * @brief Returns true if `rhs` is ordered before this string + */ + __device__ bool operator>(cudf::string_view rhs) const noexcept; + + /** + * @brief Returns true if this string matches or is ordered before `rhs` + */ + __device__ bool operator<=(cudf::string_view rhs) const noexcept; + + /** + * @brief Returns true if `rhs` matches or is ordered before this string + */ + __device__ bool operator>=(cudf::string_view rhs) const noexcept; + + /** + * @brief Remove all bytes from this string + * + * All pointers, references, and iterators are invalidated. + */ + __device__ void clear() noexcept; + + /** + * @brief Resizes string to contain `count` bytes + * + * If `count > size_bytes()` then zero-padding is added. + * If `count < size_bytes()` then the string is truncated to size `count`. + * + * All pointers, references, and iterators may be invalidated. + * + * The behavior is undefined if `count > max_size()` + * + * @param count Size in bytes of this string. + */ + __device__ void resize(cudf::size_type count); + + /** + * @brief Reserve `count` bytes in this string + * + * If `count > capacity()`, new memory is allocated and `capacity()` will + * be greater than or equal to `count`. + * There is no effect if `count <= capacity()`. 
+ * + * @param count Total number of bytes to reserve for this string + */ + __device__ void reserve(cudf::size_type count); + + /** + * @brief Returns the number of bytes that the string has allocated + */ + __device__ cudf::size_type capacity() const noexcept; + + /** + * @brief Reduces internal allocation to just `size_bytes()` + * + * All pointers, references, and iterators may be invalidated. + */ + __device__ void shrink_to_fit(); + + /** + * @brief Moves the contents of `str` into this string instance + * + * On return, the `str` will have no contents. + * + * @param str String to move + * @return This string with new contents + */ + __device__ udf_string& assign(udf_string&& str) noexcept; + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str String to copy + * @return This string with new contents + */ + __device__ udf_string& assign(cudf::string_view str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str Null-terminated UTF-8 character array + * @return This string with new contents + */ + __device__ udf_string& assign(char const* str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str UTF-8 character array + * @param bytes Number of bytes to copy from `str` + * @return This string with new contents + */ + __device__ udf_string& assign(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(cudf::string_view str); + + /** + * @brief Append a character to the end of this string + * + * @param chr Character to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(cudf::char_utf8 chr); + + /** + * @brief Append a null-terminated device memory character array + * to the end of this string + * + * @param 
str String to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(char const* str); + + /** + * @brief Append a null-terminated character array to the end of this string + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& append(char const* str); + + /** + * @brief Append a character array to the end of this string + * + * @param str Character array to append + * @param bytes Number of bytes from `str` to append. + * @return This string with the appended argument + */ + __device__ udf_string& append(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& append(cudf::string_view str); + + /** + * @brief Append a character to the end of this string + * a specified number of times. + * + * @param chr Character to append + * @param count Number of times to append `chr` + * @return This string with the append character(s) + */ + __device__ udf_string& append(cudf::char_utf8 chr, cudf::size_type count = 1); + + /** + * @brief Insert a string into the character position specified + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param str String to insert into this one + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, cudf::string_view str); + + /** + * @brief Insert a null-terminated character array into the character position specified + * + * There is no effect if `pos < 0 or pos > length()`. 
+ * + * @param pos Character position to begin insert + * @param data Null-terminated character array to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, char const* data); + + /** + * @brief Insert a character array into the character position specified + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param data Character array to insert + * @param bytes Number of bytes from `data` to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, char const* data, cudf::size_type bytes); + + /** + * @brief Insert a character one or more times into the character position specified + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param count Number of times to insert `chr` + * @param chr Character to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Returns a substring of this string + * + * An empty string is returned if `pos < 0 or pos >= length()`. + * + * @param pos Character position to start the substring + * @param count Number of characters for the substring; + * This can be greater than the number of available characters. + * Default npos returns characters in range `[pos, length())`. + * @return New string with the specified characters + */ + __device__ udf_string substr(cudf::size_type pos, cudf::size_type count = npos) const; + + /** + * @brief Replace a range of characters with a given string + * + * Replaces characters in range `[pos, pos + count)` with `str`. + * There is no effect if `pos < 0 or pos > length()`. + * + * If `count==0` then `str` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. 
+ * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param str String to replace the given range + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, cudf::size_type count, cudf::string_view str); + + /** + * @brief Replace a range of characters with a null-terminated character array + * + * Replaces characters in range `[pos, pos + count)` with `data`. + * There is no effect if `pos < 0 or pos > length()`. + * + * If `count==0` then `data` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data Null-terminated character array to replace the given range + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, cudf::size_type count, char const* data); + + /** + * @brief Replace a range of characters with a given character array + * + * Replaces characters in range `[pos, pos + count)` with `[data, data + bytes)`. + * There is no effect if `pos < 0 or pos > length()`. + * + * If `count==0` then `data` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data String to replace the given range + * @param bytes Number of bytes from data to use for replacement + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, + cudf::size_type count, + char const* data, + cudf::size_type bytes); + + /** + * @brief Replace a range of characters with a character one or more times + * + * Replaces characters in range `[pos, pos + count)` with `chr` `chr_count` times. + * There is no effect if `pos < 0 or pos > length()`. 
+ * + * If `count==0` then `chr` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param chr_count Number of times `chr` will be repeated + * @param chr Character to use for replacement + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr); + + /** + * @brief Removes specified characters from this string + * + * Removes `min(count, length() - pos)` characters starting at `pos`. + * There is no effect if `pos < 0 or pos >= length()`. + * + * @param pos Character position to begin removal + * @param count Number of characters to remove starting at `pos` + * @return This string with the characters removed + */ + __device__ udf_string& erase(cudf::size_type pos, cudf::size_type count = npos); + + private: + char* m_data{}; + cudf::size_type m_bytes{}; + cudf::size_type m_capacity{}; + + // utilities + __device__ char* allocate(cudf::size_type bytes); + __device__ void deallocate(char* data); + __device__ void reallocate(cudf::size_type bytes); + __device__ cudf::size_type char_offset(cudf::size_type byte_pos) const; + __device__ void shift_bytes(cudf::size_type start_pos, + cudf::size_type end_pos, + cudf::size_type nbytes); +}; + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index 89952dadb6c..7927740fd49 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -15,18 +15,44 @@ */ #include +#include #include #include #include +#include #include +#include + +#include +#include namespace cudf { namespace strings { namespace udf { namespace detail { +namespace { + +/** + * @brief Functor 
wraps string_view objects around udf_string objects + * + * No string data is copied. + */ +struct udf_string_to_string_view_transform_fn { + __device__ cudf::string_view operator()(cudf::strings::udf::udf_string const& dstr) + { + // A default-constructed or cleared udf_string has a null data pointer. Map it to an + // empty string_view so it is not confused with the (nullptr, 0) null placeholder + // used when building the column below. + return dstr.data() == nullptr ? cudf::string_view{} + : cudf::string_view{dstr.data(), dstr.size_bytes()}; + } +}; +} // namespace + +/** + * @copydoc to_string_view_array + * + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ std::unique_ptr to_string_view_array(cudf::column_view const input, rmm::cuda_stream_view stream) { @@ -36,13 +62,61 @@ std::unique_ptr to_string_view_array(cudf::column_view const .release())); } +/** + * @copydoc column_from_udf_string_array + * + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ +std::unique_ptr<cudf::column> column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + // create string_views of the udf_strings + auto indices = rmm::device_uvector<cudf::string_view>(size, stream); + thrust::transform(rmm::exec_policy(stream), + d_strings, + d_strings + size, + indices.data(), + udf_string_to_string_view_transform_fn{}); + + // string_view(nullptr, 0) is the null placeholder passed to make_strings_column + return cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); +} + +/** + * @copydoc free_udf_string_array + * + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ +void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + size, + [d_strings] __device__(auto idx) { d_strings[idx].clear(); }); +} + +} // namespace detail +// external APIs + std::unique_ptr to_string_view_array(cudf::column_view const input) { return detail::to_string_view_array(input, cudf::get_default_stream()); } +std::unique_ptr<cudf::column> column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size) +{ + return 
detail::column_from_udf_string_array(d_strings, size, cudf::get_default_stream()); +} + +void free_udf_string_array(udf_string* d_strings, cudf::size_type size) +{ + detail::free_udf_string_array(d_strings, size, cudf::get_default_stream()); +} + } // namespace udf } // namespace strings } // namespace cudf