From 40eb1e2736e2cfb8ed8ab13c0ba44e4ae5cf0f6e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Oct 2022 16:46:59 -0400 Subject: [PATCH 01/54] Add strings udf C++ classes and function for phase II --- .../cpp/include/cudf/strings/udf/case.cuh | 207 +++++++ .../cpp/include/cudf/strings/udf/numeric.cuh | 74 +++ .../cpp/include/cudf/strings/udf/split.cuh | 192 +++++++ .../cpp/include/cudf/strings/udf/strip.cuh | 110 ++++ .../cpp/include/cudf/strings/udf/udf_apis.hpp | 10 + .../include/cudf/strings/udf/udf_string.cuh | 472 ++++++++++++++++ .../include/cudf/strings/udf/udf_string.hpp | 523 ++++++++++++++++++ .../cpp/src/strings/udf/udf_apis.cu | 61 ++ 8 files changed, 1649 insertions(+) create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/case.cuh create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/split.cuh create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh new file mode 100644 index 00000000000..98b25f85f74 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +// global variables for character-type flags and case conversion +struct chars_tables { + cudf::strings::detail::character_flags_table_type* flags_table; + cudf::strings::detail::character_cases_table_type* cases_table; + struct cudf::strings::detail::special_case_mapping* special_case_mapping_table; +}; + +namespace detail { + +/** + * @brief Utility for converting a single character + * + * There are special cases where the conversion may result in multiple characters. + * + * @param tables The char tables required for conversion + * @param result String to append the converted character + * @param code_point The code-point of the character to convert + * @param flag The char-type flag of the character to convert + */ +__device__ inline void convert_char(chars_tables const tables, + udf_string& result, + uint32_t code_point, + uint8_t flag) +{ + if (!cudf::strings::detail::IS_SPECIAL(flag)) { + result.append(cudf::strings::detail::codepoint_to_utf8(tables.cases_table[code_point])); + return; + } + + // handle special case + auto const map = + tables + .special_case_mapping_table[cudf::strings::detail::get_special_case_hash_index(code_point)]; + auto const output_count = + cudf::strings::detail::IS_LOWER(flag) ? map.num_upper_chars : map.num_lower_chars; + auto const* output_chars = cudf::strings::detail::IS_LOWER(flag) ? map.upper : map.lower; + for (uint16_t idx = 0; idx < output_count; idx++) { + result.append(cudf::strings::detail::codepoint_to_utf8(output_chars[idx])); + } +} + +/** + * @brief Converts the given string to either upper or lower case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @param case_flag Identifies upper/lower case conversion + * @return New string containing the converted characters + */ +__device__ inline udf_string convert_case( + chars_tables const tables, + string_view d_str, + cudf::strings::detail::character_flags_table_type case_flag) +{ + udf_string result; + for (auto const chr : d_str) { + auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0; + + if ((flag & case_flag) || (cudf::strings::detail::IS_SPECIAL(flag) && + !cudf::strings::detail::IS_UPPER_OR_LOWER(flag))) { + convert_char(tables, result, code_point, flag); + } else { + result.append(chr); + } + } + + return result; +} + +/** + * @brief Utility for capitalize and title functions + * + * @tparam CaptializeNextFn returns true if the next candidate character should be capitalized + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +template +__device__ inline udf_string capitalize(chars_tables const tables, + string_view d_str, + CapitalizeNextFn next_fn) +{ + udf_string result; + bool capitalize = true; + for (auto const chr : d_str) { + auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0; + auto const change_case = + capitalize ? cudf::strings::detail::IS_LOWER(flag) : cudf::strings::detail::IS_UPPER(flag); + if (change_case) { + detail::convert_char(tables, result, code_point, flag); + } else { + result.append(chr); + } + capitalize = next_fn(flag); + } + return result; +} +} // namespace detail + +/** + * @brief Converts the given string to lower case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string to_lower(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = + cudf::strings::detail::IS_UPPER(0xFF); // convert only upper case characters + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Converts the given string to upper case + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string to_upper(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = + cudf::strings::detail::IS_LOWER(0xFF); // convert only lower case characters + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Converts the given string to lower/upper case + * + * All lower case characters are converted to upper case and + * all upper case characters are converted to lower case. + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string swapcase(chars_tables const tables, string_view d_str) +{ + cudf::strings::detail::character_flags_table_type case_flag = + cudf::strings::detail::IS_LOWER(0xFF) | cudf::strings::detail::IS_UPPER(0xFF); + return detail::convert_case(tables, d_str, case_flag); +} + +/** + * @brief Capitalize the first character of the given string + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string capitalize(chars_tables const tables, string_view d_str) +{ + auto next_fn = [](cudf::strings::detail::character_flags_table_type) -> bool { return false; }; + return detail::capitalize(tables, d_str, next_fn); +} + +/** + * @brief Converts the given string to title case + * + * The first character after a non-character is converted to upper case. + * All other characters are converted to lower case. + * + * @param tables The char tables required for conversion + * @param d_str Input string to convert + * @return New string containing the converted characters + */ +__device__ inline udf_string title(chars_tables const tables, string_view d_str) +{ + auto next_fn = [](cudf::strings::detail::character_flags_table_type flag) -> bool { + return !cudf::strings::detail::IS_ALPHA(flag); + }; + return detail::capitalize(tables, d_str, next_fn); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh new file mode 100644 index 00000000000..48f709ae318 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -0,0 +1,74 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Converts a string into an integer. + * + * The '+' and '-' are allowed but only at the beginning of the string. + * The string is expected to contain base-10 [0-9] characters only. + * Any other character will end the parse. + * Overflow of the int64 type is not detected. + */ +__device__ inline int64_t stoi(string_view const& d_str) +{ + return cudf::strings::detail::string_to_integer(d_str); +} + +/** + * @brief Converts an integer into string + * + * @param value integer value to convert + */ +__device__ inline udf_string to_string(int64_t value) +{ + udf_string result; + if (value == 0) { + result.append("0"); + return result; + } + auto const d_value = static_cast(abs(value)); + auto digits = static_cast(log10(d_value)) + 1 + (value < 0); + result.resize(digits); + cudf::strings::detail::integer_to_string(value, result.data()); + return result; +} + +/** + * @brief Converts a string into a double. + * + * Support scientific notation as well. + * Overflow goes to inf or -inf and underflow may go to 0. + */ +__device__ inline double stod(string_view const& d_str) +{ + return cudf::strings::detail::stod(d_str); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh new file mode 100644 index 00000000000..f7fa32d0e45 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "udf_string.cuh" + +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Split string using given string + * + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. Calling this with a `nullptr` for the `result` + * will return the number of elements. + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{}; // empty string + * auto token_count = split(d_str, tgt, nullptr); + * auto result = new cudf::string_view[token_count]; + * split(d_str, tgt, result); + * // result is array like ["the", "best", "of", "times"] + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + cudf::string_view const tgt, + cudf::string_view* result) +{ + auto const nchars = d_str.length(); + cudf::size_type count = 0; + + cudf::size_type last_pos = 0; + while (last_pos <= nchars) { + cudf::size_type const pos = d_str.find(tgt, last_pos); + auto const length = (pos < 0 ? nchars : pos) - last_pos; + if (result) { *result++ = d_str.substr(last_pos, length); } + last_pos = pos + tgt.length(); + ++count; + if (pos < 0) { break; } + } + + return count; +} + +/** + * @brief Split string using given target array. + * + * @param d_str String to split + * @param tgt Character array encoded in UTF-8 used for identifying split points + * @param bytes Number of bytes to read from `tgt` + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline int split(cudf::string_view const d_str, + char const* tgt, + cudf::size_type bytes, + cudf::string_view* result) +{ + return split(d_str, cudf::string_view{tgt, bytes}, result); +} + +/** + * @brief Split string using given target array. + * + * @param d_str String to split + * @param tgt Null-terminated character array encoded in UTF-8 used for identifying split points + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline int split(cudf::string_view const d_str, + char const* tgt, + cudf::string_view* result) +{ + return split(d_str, tgt, detail::bytes_in_null_terminated_string(tgt), result); +} + +/** + * @brief Split string on whitespace. + * + * This will create tokens by splitting on one or more consecutive whitespace characters + * found in `d_str`. + * + * @param d_str String to split + * @param result Empty array to populate with output objects. + * Pass `nullptr` to just get the token count. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::string_view* result) +{ + cudf::strings::detail::whitespace_string_tokenizer tokenizer{d_str}; + cudf::size_type count = 0; + while (tokenizer.next_token()) { + auto token = tokenizer.get_token(); + if (result) { *result++ = d_str.substr(token.first, token.second - token.first); } + ++count; + } + return count; +} + +/** + * @brief Join an array of strings with a separator. + * + * @code{.cpp} + * auto separator = cudf::string_view{"::", 2}; + * cudf::string_view input[] = { + * cudf::string_view{"hello", 5}, + * cudf::string_view{"goodbye", 7}, + * cudf::string_view{"world", 5} }; + * + * auto result = join(separator, input, 3); + * // result is "hello::goodbye::world" + * @endcode + * + * @param separator Separator string + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(cudf::string_view const separator, + cudf::string_view* input, + cudf::size_type count) +{ + udf_string result{""}; + while (count-- > 0) { + result += *input++; + if (count > 0) { result += separator; } + } + return result; +} + +/** + * @brief Join an array of strings with a separator. + * + * @param separator Null-terminated UTF-8 string + * @param bytes Number of bytes to read from `separator` + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(char const* separator, + cudf::size_type bytes, + cudf::string_view* input, + cudf::size_type count) +{ + return join(cudf::string_view{separator, bytes}, input, count); +} + +/** + * @brief Join an array of strings with a separator. + * + * @param separator Null-terminated UTF-8 string + * @param input An array of strings to join + * @param count Number of elements in `input` + * @return New string + */ +__device__ inline udf_string join(char const* separator, + cudf::string_view* input, + cudf::size_type count) +{ + return join(separator, detail::bytes_in_null_terminated_string(separator), input, count); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh new file mode 100644 index 00000000000..1cd7dfbaa78 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh @@ -0,0 +1,110 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Strip characters from the beginning and/or end of the given string. + * + * The `d_to_strip` is interpretted as an array of characters to be removed. + * If `d_to_strip` is an empty string, whitespace characters are stripped. + * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = strip(d_str, d_to_strip); + * // result is "aba" + * d_to_strip = cudf::string_view{" a", 2}; // space and 'a' + * result = strip(d_str, d_to_strip); + * // result is "b" ('a' or ' ' removed from the ends) + * @endcode + * + * @param d_str String to strip characters from + * @param d_to_strip Characters to remove + * @param stype From where to strip the characters; + * Default `BOTH` indicates stripping characters from the + * beginning and the end of the input string `d_str` + * @return New string with characters removed + */ +__device__ udf_string strip(cudf::string_view const d_str, + cudf::string_view const d_to_strip, + side_type stype = side_type::BOTH) +{ + return udf_string{cudf::strings::detail::strip(d_str, d_to_strip, stype)}; +} + +/** + * @brief Strip characters from the beginning of the given string. + * + * The `d_to_strip` is interpretted as an array of characters to be removed. + * If `d_to_strip` is an empty string, whitespace characters are stripped. + * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = lstrip(d_str, d_to_strip); + * // result is "aba " + * d_to_strip = cudf::string_view{"a ", 2}; // space and 'a' + * result = lstrip(d_str, d_to_strip); + * // result is "ba " ('a' or ' ' removed from the beginning) + * @endcode + * + * @param d_str String to strip characters from + * @param d_to_strip Characters to remove + * @return New string with characters removed + */ +__device__ udf_string lstrip(cudf::string_view const d_str, cudf::string_view d_to_strip) +{ + return strip(d_str, d_to_strip, side_type::LEFT); +} + +/** + * @brief Strip characters from the end of the given string. + * + * The `d_to_strip` is interpretted as an array of characters to be removed. + * If `d_to_strip` is an empty string, whitespace characters are stripped. + * + * @code{.cpp} + * auto d_str = cudf::string_view{" aba ", 5}; + * auto d_to_strip = cudf::string_view{}; // empty string + * auto result = rstrip(d_str, d_to_strip); + * // result is " aba" + * d_to_strip = cudf::string_view{" a", 2}; // space and 'a' + * result = rstrip(d_str, d_to_strip); + * // result is " ab" ('a' or ' ' removed from the end) + * @endcode + * + * @param d_str String to strip characters from + * @param d_to_strip Characters to remove + * @return New string with characters removed + */ +__device__ udf_string rstrip(cudf::string_view const d_str, cudf::string_view d_to_strip) +{ + return strip(d_str, d_to_strip, side_type::RIGHT); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index 6de9b91de08..f5807be151b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -34,6 +35,15 @@ namespace udf { */ std::unique_ptr to_string_view_array(cudf::column_view const input); +/** + * @brief Return a cudf::column given an array of udf_string objects. + * + * @param d_buffer Pointer to device memory of udf_string objects + * @param d_size The number of bytes in the d_buffer + * @return A strings column copy of the udf_string objects + */ +std::unique_ptr column_from_udf_string_array(void* d_buffer, std::size_t size); + } // namespace udf } // namespace strings } // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh new file mode 100644 index 00000000000..5ab262fb07d --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "udf_string.hpp" + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { +namespace detail { + +__device__ inline static cudf::size_type bytes_in_null_terminated_string(char const* str) +{ + if (!str) return 0; + cudf::size_type bytes = 0; + while (*str++) ++bytes; + return bytes; +} + +} // namespace detail + +__device__ inline char* udf_string::allocate(cudf::size_type bytes) +{ + char* data = static_cast(malloc(bytes + 1)); + data[bytes] = 0; // add null-terminator so we can printf strings in device code + return data; +} + +__device__ inline void udf_string::deallocate(char* data) +{ + if (data) free(data); +} + +__device__ void udf_string::reallocate(cudf::size_type bytes) +{ + m_capacity = bytes; + auto new_data = allocate(m_capacity); + memcpy(new_data, m_data, std::min(m_bytes, bytes)); + deallocate(m_data); + m_data = new_data; +} + +__device__ inline udf_string::udf_string(char const* data, cudf::size_type bytes) + : m_bytes(bytes), m_capacity(bytes) +{ + m_data = allocate(m_capacity); + memcpy(m_data, data, bytes); +} + +__device__ udf_string::udf_string(cudf::size_type count, cudf::char_utf8 chr) +{ + if (count <= 0) { return; } + m_bytes = m_capacity = cudf::strings::detail::bytes_in_char_utf8(chr) * count; + m_data = allocate(m_capacity); + auto out_ptr = m_data; + for (auto idx = 0; idx < count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } +} + +__device__ inline udf_string::udf_string(char const* data) +{ + m_bytes = m_capacity = detail::bytes_in_null_terminated_string(data); + m_data = allocate(m_capacity); + memcpy(m_data, data, m_bytes); +} + +__device__ inline udf_string::udf_string(udf_string const& src) + : m_bytes(src.m_bytes), m_capacity(src.m_bytes) +{ + m_data = allocate(m_capacity); + memcpy(m_data, src.m_data, m_bytes); +} + +__device__ inline udf_string::udf_string(udf_string&& src) + : m_data(src.m_data), m_bytes(src.m_bytes), m_capacity(src.m_capacity) +{ + src.m_data = nullptr; + src.m_bytes = 0; + src.m_capacity = 0; +} + +__device__ inline udf_string::udf_string(cudf::string_view const str) + : m_bytes(str.size_bytes()), m_capacity(str.size_bytes()) +{ + m_data = allocate(m_capacity); + memcpy(m_data, str.data(), m_bytes); +} + +__device__ inline udf_string::~udf_string() { deallocate(m_data); } + +__device__ inline udf_string& udf_string::operator=(udf_string const& str) { return assign(str); } + +__device__ inline udf_string& udf_string::operator=(udf_string&& str) +{ + return assign(std::move(str)); +} + +__device__ inline udf_string& udf_string::operator=(cudf::string_view const str) +{ + return assign(str); +} + +__device__ inline udf_string& udf_string::operator=(char const* str) { return assign(str); } + +__device__ udf_string& udf_string::assign(udf_string&& str) +{ + if (this == &str) { return *this; } + m_data = str.m_data; + m_bytes = str.m_bytes; + m_capacity = str.m_capacity; + str.m_data = nullptr; + str.m_bytes = 0; + str.m_capacity = 0; + return *this; +} + +__device__ udf_string& udf_string::assign(cudf::string_view const str) +{ + return assign(str.data(), str.size_bytes()); +} + +__device__ udf_string& udf_string::assign(char const* str) +{ + return assign(str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ udf_string& udf_string::assign(char const* str, cudf::size_type bytes) +{ + if (bytes >= m_capacity) { + deallocate(m_data); + m_capacity = bytes; + m_data = allocate(m_capacity); + } + m_bytes = bytes; + memcpy(m_data, str, bytes); + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline cudf::size_type udf_string::size_bytes() const { return m_bytes; } + +__device__ inline cudf::size_type udf_string::length() const +{ + return cudf::strings::detail::characters_in_string(m_data, m_bytes); +} + +__device__ cudf::size_type udf_string::max_size() const +{ + return std::numeric_limits::max() - 1; +} + +__device__ inline char* udf_string::data() { return m_data; } + +__device__ inline char const* udf_string::data() const { return m_data; } + +__device__ inline bool udf_string::is_empty() const { return m_bytes == 0; } + +__device__ inline bool udf_string::is_null() const { return m_data == nullptr; } + +__device__ inline cudf::string_view::const_iterator udf_string::begin() const +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), 0); +} + +__device__ inline cudf::string_view::const_iterator udf_string::end() const +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), length()); +} + +__device__ inline cudf::char_utf8 udf_string::at(cudf::size_type pos) const +{ + auto const offset = byte_offset(pos); + auto chr = cudf::char_utf8{0}; + if (offset < m_bytes) { cudf::strings::detail::to_char_utf8(data() + offset, chr); } + return chr; +} + +__device__ inline cudf::char_utf8 udf_string::operator[](cudf::size_type pos) const +{ + return at(pos); +} + +__device__ inline cudf::size_type udf_string::byte_offset(cudf::size_type pos) const +{ + cudf::size_type offset = 0; + + auto sptr = m_data; + auto eptr = sptr + m_bytes; + while ((pos > 0) && (sptr < eptr)) { + auto const byte = static_cast(*sptr++); + auto const char_bytes = cudf::strings::detail::bytes_in_utf8_byte(byte); + if (char_bytes) { --pos; } + offset += char_bytes; + } + return offset; +} + +__device__ inline int udf_string::compare(cudf::string_view const in) const +{ + return compare(in.data(), in.size_bytes()); +} + +__device__ inline int udf_string::compare(char const* data, cudf::size_type bytes) const +{ + auto const view = static_cast(*this); + return view.compare(data, bytes); +} + +__device__ inline bool udf_string::operator==(cudf::string_view const rhs) const +{ + return m_bytes == rhs.size_bytes() && compare(rhs) == 0; +} + +__device__ inline bool udf_string::operator!=(cudf::string_view const rhs) const +{ + return compare(rhs) != 0; +} + +__device__ inline bool udf_string::operator<(cudf::string_view const rhs) const +{ + return compare(rhs) < 0; +} + +__device__ inline bool udf_string::operator>(cudf::string_view const rhs) const +{ + return compare(rhs) > 0; +} + +__device__ inline bool udf_string::operator<=(cudf::string_view const rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc < 0); +} + +__device__ inline bool udf_string::operator>=(cudf::string_view const rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc > 0); +} + +__device__ inline void udf_string::clear() +{ + deallocate(m_data); + m_data = nullptr; + m_bytes = 0; + m_capacity = 0; +} + +__device__ inline void udf_string::resize(cudf::size_type count) +{ + if (count > max_size()) { return; } + if (count > m_capacity) { reallocate(count); } + + // add padding if necessary (null chars) + if (count > m_bytes) { memset(m_data + m_bytes, 0, count - m_bytes); } + + m_bytes = count; + m_data[m_bytes] = 0; +} + +__device__ void udf_string::reserve(cudf::size_type count) +{ + if (count < max_size() && count > m_capacity) { reallocate(count); } +} + +__device__ cudf::size_type udf_string::capacity() const { return m_capacity; } + +__device__ void udf_string::shrink_to_fit() +{ + if (m_bytes < m_capacity) { reallocate(m_bytes); } +} + +__device__ inline udf_string& udf_string::append(char const* str, cudf::size_type in_bytes) +{ + if (in_bytes <= 0) { return *this; } + auto const nbytes = m_bytes + in_bytes; + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + memcpy(m_data + m_bytes, str, in_bytes); + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline udf_string& udf_string::append(char const* str) +{ + return append(str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::append(cudf::char_utf8 chr, cudf::size_type count) +{ + if (count <= 0) { return *this; } + auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * count; + auto const nbytes = m_bytes + char_bytes; + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + auto out_ptr = m_data + m_bytes; + for (auto idx = 0; idx < count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline udf_string& udf_string::append(cudf::string_view const in) +{ + return append(in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::operator+=(cudf::string_view const in) +{ + return append(in); +} + +__device__ inline udf_string& udf_string::operator+=(cudf::char_utf8 chr) { return append(chr); } + +__device__ inline udf_string& udf_string::operator+=(char const* str) { return append(str); } + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, + char const* str, + cudf::size_type in_bytes) +{ + return replace(pos, 0, str, in_bytes); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, char const* str) +{ + return insert(pos, str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, cudf::string_view const in) +{ + return insert(pos, in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, + cudf::size_type count, + cudf::char_utf8 chr) +{ + return replace(pos, 0, count, chr); +} + +__device__ inline udf_string udf_string::substr(cudf::size_type pos, cudf::size_type count) const +{ + if (pos < 0) { return udf_string{"", 0}; } + auto const spos = byte_offset(pos); + if (spos >= m_bytes) { return udf_string{"", 0}; } + auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + return udf_string{data() + spos, epos - spos}; +} + +// utility for replace() +__device__ void udf_string::shift_bytes(cudf::size_type spos, + cudf::size_type epos, + cudf::size_type nbytes) +{ + if (nbytes < m_bytes) { + // shift bytes to the left [...wxyz] -> [wxyzxyz] + auto src = epos; + auto tgt = spos; + while (tgt < nbytes) { m_data[tgt++] = m_data[src++]; } + } else if (nbytes > m_bytes) { + // shift bytes to the right [abcd...] -> [abcabcd] + auto src = m_bytes; + auto tgt = nbytes; + while (src > epos) { m_data[--tgt] = m_data[--src]; } + } +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + char const* str, + cudf::size_type in_bytes) +{ + if (pos < 0 || in_bytes < 0) { return *this; } + auto const spos = byte_offset(pos); + if (spos > m_bytes) { return *this; } + auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + + // compute new size + auto const nbytes = m_bytes + in_bytes - (epos - spos); + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + + // move bytes -- make room for replacement + shift_bytes(spos + in_bytes, epos, nbytes); + + // insert the replacement + memcpy(m_data + spos, str, in_bytes); + + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + char const* str) +{ + return replace(pos, count, str, detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + cudf::string_view const in) +{ + return replace(pos, count, in.data(), in.size_bytes()); +} + +__device__ inline udf_string& udf_string::replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr) +{ + if (pos < 0 || chr_count < 0) { return *this; } + auto const spos = byte_offset(pos); + if (spos > m_bytes) { return *this; } + auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + + // compute input size + auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * chr_count; + // compute new output size + auto const nbytes = m_bytes + char_bytes - (epos - spos); + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + + // move bytes -- make room for the new character(s) + shift_bytes(spos + char_bytes, epos, nbytes); + + // copy chr chr_count times + auto out_ptr = m_data + spos; + for (auto idx = 0; idx < chr_count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } + + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ udf_string& udf_string::erase(cudf::size_type pos, cudf::size_type count) +{ + return replace(pos, count, nullptr, 0); +} + +__device__ inline cudf::size_type udf_string::char_offset(cudf::size_type bytepos) const +{ + return cudf::strings::detail::characters_in_string(data(), bytepos); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp new file mode 100644 index 00000000000..a0da0186b24 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace strings { +namespace udf { + +class udf_string { + public: + /** + * @brief Represents unknown character position or length. + */ + static constexpr cudf::size_type npos = static_cast(-1); + + /** + * @brief Cast to cudf::string_view operator + */ + __device__ operator cudf::string_view() const { return cudf::string_view(m_data, m_bytes); } + + /** + * @brief Create an empty string. + */ + udf_string() = default; + + /** + * @brief Create a string using existing device memory. + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded string + * @param bytes Number of bytes in `data` + */ + __device__ udf_string(char const* data, cudf::size_type bytes); + + /** + * @brief Create a string object from a null-terminated character array. + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded null-terminated + * character array. + */ + __device__ udf_string(char const* data); + + /** + * @brief Create a string object from a cudf::string_view. + * + * The input string data is copied into the instance returned. + * + * @param str String to copy + */ + __device__ udf_string(cudf::string_view const str); + + /** + * @brief Create a string object with `count` copies of character `chr`. + * + * @param count Number of times to copy `chr` + * @param chr Character from which to create the string + */ + __device__ udf_string(cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Create a string object from another instance. + * + * The string data is copied from the `src` into the instance returned. + * + * @param src String to copy + */ + __device__ udf_string(udf_string const& src); + + /** + * @brief Create a string object from a move reference. + * + * The string data is moved from `src` into the instance returned. + * The `src` will have no content. + * + * @param src String to copy + */ + __device__ udf_string(udf_string&& src); + + __device__ ~udf_string(); + + __device__ udf_string& operator=(udf_string const&); + __device__ udf_string& operator=(udf_string&&); + __device__ udf_string& operator=(cudf::string_view const); + __device__ udf_string& operator=(char const*); + + /** + * @brief Return the number of bytes in this string. + */ + __device__ cudf::size_type size_bytes() const; + + /** + * @brief Return the number of characters in this string. + */ + __device__ cudf::size_type length() const; + + /** + * @brief Return the maximum number of bytes a udf_string can hold. + */ + __device__ cudf::size_type max_size() const; + + /** + * @brief Return the internal pointer to the character array for this object. + */ + __device__ char* data(); + __device__ char const* data() const; + + /** + * @brief Returns true if there are no characters in this string. + */ + __device__ bool is_empty() const; + + /** + * @brief Returns true if `data()==nullptr` + * + * This is experimental and may be removed in the futre. + */ + __device__ bool is_null() const; + + /** + * @brief Returns an iterator that can be used to navigate through + * the UTF-8 characters in this string. + * + * This returns a `cudf::string_view::const_iterator` which is read-only. + */ + __device__ cudf::string_view::const_iterator begin() const; + __device__ cudf::string_view::const_iterator end() const; + + /** + * @brief Returns the character at the specified position. + * + * This will return 0 if `pos >= length()`. + * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 at(cudf::size_type pos) const; + + /** + * @brief Returns the character at the specified index. + * + * This will return 0 if `pos >= length()`. + * Note this is read-only. Use replace() to modify a character. + * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 operator[](cudf::size_type pos) const; + + /** + * @brief Return the byte offset for a given character position. + * + * The byte offset for the character at `pos` such that + * `data() + byte_offset(pos)` points to the memory location + * the character at position `pos`. + * + * @param pos Index position of character to return byte offset. + * @return Byte offset for character at `pos` + */ + __device__ cudf::size_type byte_offset(cudf::size_type pos) const; + + /** + * @brief Comparing target string with this string + * + * @param str Target string to compare with this string + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but the `str` string is shorter. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but the `str` string is longer. + */ + __device__ int compare(cudf::string_view const str) const; + + /** + * @brief Comparing target character array with this string + * + * @param str Target array of UTF-8 characters. + * @param bytes Number of bytes in `str`. + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but `bytes < size_bytes()`. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but `bytes > size_bytes()`. + */ + __device__ int compare(char const* str, cudf::size_type bytes) const; + + /** + * @brief Returns true if `rhs` matches this string exactly + */ + __device__ bool operator==(cudf::string_view const rhs) const; + + /** + * @brief Returns true if `rhs` does not match this string + */ + __device__ bool operator!=(cudf::string_view const rhs) const; + + /** + * @brief Returns true if this string is ordered before `rhs` + */ + __device__ bool operator<(cudf::string_view const rhs) const; + + /** + * @brief Returns true if `rhs` is ordered before this string + */ + __device__ bool operator>(cudf::string_view const rhs) const; + + /** + * @brief Returns true if this string matches or is ordered before `rhs` + */ + __device__ bool operator<=(cudf::string_view const rhs) const; + + /** + * @brief Returns true if `rhs` matches or is ordered before this string + */ + __device__ bool operator>=(cudf::string_view const rhs) const; + + /** + * @brief Remove all bytes from this string. + * + * All pointers, references, and iterators are invalidated. + */ + __device__ void clear(); + + /** + * @brief Resizes string to contain `count` bytes. + * + * If `count > size_bytes()` then zero-padding is added. + * If `count < size_bytes()` then the string is truncated to size `count`. + * + * All pointers, references, and iterators may be invalidated. + * + * @param count Size in bytes of this string. + */ + __device__ void resize(cudf::size_type count); + + /** + * @brief Reserve `count` bytes in this string. + * + * If `count > capacity()`, new memory is allocated and `capacity()` will + * be greater than or equal to `count`. + * There is no effect if `count <= capacity()`. + * + * @param count Total number of bytes to reserve for this string + */ + __device__ void reserve(cudf::size_type count); + + /** + * @brief Returns the number of bytes that the string has allocated. + */ + __device__ cudf::size_type capacity() const; + + /** + * @brief Reduces internal allocation to just `size_bytes()`. + * + * All pointers, references, and iterators may be invalidated. + */ + __device__ void shrink_to_fit(); + + /** + * @brief Moves the contents of `str` into this string instance + * + * @param str String to move + * @return This string new contents + */ + __device__ udf_string& assign(udf_string&& str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str String to copy + * @return This string new contents + */ + __device__ udf_string& assign(cudf::string_view const str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str Null-terminated UTF-8 character array + * @return This string new contents + */ + __device__ udf_string& assign(char const* str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str UTF-8 character array + * @param bytes Number of bytes to copy from `str` + * @return This string new contents + */ + __device__ udf_string& assign(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(cudf::string_view const str); + + /** + * @brief Append a character to the end of this string. + * + * @param str Character to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(cudf::char_utf8 chr); + + /** + * @brief Append a null-terminated device memory character array + * to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& operator+=(char const* str); + + /** + * @brief Append a null-terminated character array to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& append(char const* str); + + /** + * @brief Append a character array to the end of this string. + * + * @param str Character array to append + * @param bytes Number of bytes from `str` to append. + * @return This string with the appended argument + */ + __device__ udf_string& append(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ udf_string& append(cudf::string_view const str); + + /** + * @brief Append a character to the end of this string + * a specified number of times. + * + * @param chr Character to append + * @param count Number of times to append `chr` + * @return This string with the append character(s) + */ + __device__ udf_string& append(cudf::char_utf8 chr, cudf::size_type count = 1); + + /** + * @brief Insert a string into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param str String to insert into this one + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, cudf::string_view const str); + + /** + * @brief Insert a null-terminated character array into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param data Null-terminated character array to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, char const* data); + + /** + * @brief Insert a character array into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param data Character array to insert + * @param bytes Number of bytes from `data` to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, char const* data, cudf::size_type bytes); + + /** + * @brief Insert a character one or more times into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param count Number of times to insert `chr` + * @param chr Character to insert + * @return This string with the inserted argument + */ + __device__ udf_string& insert(cudf::size_type pos, cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Returns a substring of this string. + * + * An empty string is returned if `pos < 0 or pos >= length()`. + * + * @param pos Character position to start the substring + * @param count Number of characters for the substring; + * This can be greater than the number of available characters. + * Default npos returns characters in range `[pos, length())`. + * @return New string with the specified characters + */ + __device__ udf_string substr(cudf::size_type pos, cudf::size_type count = npos) const; + + /** + * @brief Replace a range of characters with a given string. + * + * Replaces characters in range `[pos, pos + count]` with `str`. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param str String to replace the given range + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, + cudf::size_type count, + cudf::string_view const str); + + /** + * @brief Replace a range of characters with a null-terminated character array. + * + * Replaces characters in range `[pos, pos + count)` with `data`. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data Null-terminated character array to replace the given range + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, cudf::size_type count, char const* data); + + /** + * @brief Replace a range of characters with a given character array. + * + * Replaces characters in range `[pos, pos + count)` with `[data, data + bytes)`. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data String to replace the given range + * @param bytes Number of bytes from data to use for replacement + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, + cudf::size_type count, + char const* data, + cudf::size_type bytes); + + /** + * @brief Replace a range of characters with a character one or more times. + * + * Replaces characters in range `[pos, pos + count)` with `chr` `chr_count` times. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param chr_count Number of times `chr` will repeated + * @param chr Character to use for replacement + * @return This string modified with the replacement + */ + __device__ udf_string& replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr); + + /** + * @brief Removes specified characters from this string. + * + * Removes `min(count, length() - pos)` characters starting at `pos`. + * There is no effect if `pos < 0 or pos >= length()`. + * + * @param pos Character position to begin insert + * @param count Number of characters to remove starting at `pos` + * @return This string with remove characters + */ + __device__ udf_string& erase(cudf::size_type pos, cudf::size_type count = npos); + + private: + char* m_data{}; + cudf::size_type m_bytes{}; + cudf::size_type m_capacity{}; + + // utilities + __device__ char* allocate(cudf::size_type bytes); + __device__ void deallocate(char* data); + __device__ void reallocate(cudf::size_type bytes); + __device__ cudf::size_type char_offset(cudf::size_type bytepos) const; + __device__ void shift_bytes(cudf::size_type spos, cudf::size_type epos, cudf::size_type nbytes); +}; + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index dfef1be39f5..2bc74c8ca19 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -15,17 +15,48 @@ */ #include +#include #include #include #include #include +#include + +#include +#include namespace cudf { namespace strings { namespace udf { namespace detail { +namespace { + +struct free_udf_string_fn { + cudf::strings::udf::udf_string* d_strings; + __device__ void operator()(cudf::size_type idx) { d_strings[idx].clear(); } +}; + +void free_udf_string_array(void* d_buffer, std::size_t buffer_size, rmm::cuda_stream_view stream) +{ + auto const size = + static_cast(buffer_size / sizeof(cudf::strings::udf::udf_string)); + auto d_strings = reinterpret_cast(d_buffer); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + size, + free_udf_string_fn{d_strings}); +} + +struct udf_string_to_string_view_transform_fn { + __device__ cudf::string_view operator()(cudf::strings::udf::udf_string const& dstr) + { + return cudf::string_view{dstr.data(), dstr.size_bytes()}; + } +}; + +} // namespace std::unique_ptr to_string_view_array(cudf::column_view const input, rmm::cuda_stream_view stream) @@ -36,6 +67,31 @@ std::unique_ptr to_string_view_array(cudf::column_view const .release())); } +std::unique_ptr column_from_udf_string_array(void* d_buffer, + std::size_t buffer_size, + rmm::cuda_stream_view stream) +{ + auto const size = + static_cast(buffer_size / sizeof(cudf::strings::udf::udf_string)); + auto d_input = reinterpret_cast(d_buffer); + + // create string_views of the udf_strings + auto indices = rmm::device_uvector(size, stream); + thrust::transform(rmm::exec_policy(stream), + d_input, + d_input + size, + indices.data(), + udf_string_to_string_view_transform_fn{}); + + auto results = cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); + + // free the individual udf_string elements + free_udf_string_array(d_buffer, buffer_size, stream); + + // return new column + return results; +} + } // namespace detail std::unique_ptr to_string_view_array(cudf::column_view const input) @@ -43,6 +99,11 @@ std::unique_ptr to_string_view_array(cudf::column_view const return detail::to_string_view_array(input, rmm::cuda_stream_default); } +std::unique_ptr column_from_udf_string_array(void* d_buffer, std::size_t buffer_size) +{ + return detail::column_from_udf_string_array(d_buffer, buffer_size, rmm::cuda_stream_default); +} + } // namespace udf } // namespace strings } // namespace cudf From 5317db88d51638e673ba49da905c42dd00ab0b59 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Oct 2022 17:04:53 -0400 Subject: [PATCH 02/54] fix style error --- .../cpp/include/cudf/strings/udf/udf_string.cuh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index 5ab262fb07d..ff4a4399e40 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -34,7 +34,8 @@ __device__ inline static cudf::size_type bytes_in_null_terminated_string(char co { if (!str) return 0; cudf::size_type bytes = 0; - while (*str++) ++bytes; + while (*str++) + ++bytes; return bytes; } @@ -379,12 +380,16 @@ __device__ void udf_string::shift_bytes(cudf::size_type spos, // shift bytes to the left [...wxyz] -> [wxyzxyz] auto src = epos; auto tgt = spos; - while (tgt < nbytes) { m_data[tgt++] = m_data[src++]; } + while (tgt < nbytes) { + m_data[tgt++] = m_data[src++]; + } } else if (nbytes > m_bytes) { // shift bytes to the right [abcd...] -> [abcabcd] auto src = m_bytes; auto tgt = nbytes; - while (src > epos) { m_data[--tgt] = m_data[--src]; } + while (src > epos) { + m_data[--tgt] = m_data[--src]; + } } } From 2e36b6a6257fc7739426fa431681ff6cae7553f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 13 Oct 2022 08:20:49 -0700 Subject: [PATCH 03/54] support returning strings within strings_udf library --- .../strings_udf/cpp/src/strings/udf/shim.cu | 12 +++++ .../strings_udf/_lib/cpp/strings_udf.pxd | 3 ++ .../strings_udf/_lib/cudf_jit_udf.pyx | 13 +++++ python/strings_udf/strings_udf/_typing.py | 39 +++++++++++---- python/strings_udf/strings_udf/lowering.py | 48 ++++++++++++++++++- .../strings_udf/tests/test_string_udfs.py | 48 +++++++++++++++---- 6 files changed, 144 insertions(+), 19 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 4d6690468ff..b284d58fe58 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -17,6 +17,7 @@ #include #include #include +#include using namespace cudf::strings::udf; @@ -215,3 +216,14 @@ extern "C" __device__ int pycount(int* nb_retval, void const* str, void const* s *nb_retval = count(*str_view, *substr_view); return 0; } + +extern "C" __device__ int udf_string_from_string_view(int* nb_retbal, + void const* str, + void* udf_str) +{ + auto str_view_ptr = reinterpret_cast(str); + auto udf_str_ptr = reinterpret_cast(udf_str); + *udf_str_ptr = udf_string(*str_view_ptr); + + return 0; +} diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd index fb8e3a949bf..ee145e4023b 100644 --- a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -14,6 +14,9 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + + cdef unique_ptr[column] column_from_udf_string_array( + void*, size_t + ) except + cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ "cudf::strings::detail" nogil: diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index bb1892a4d26..c450c394638 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -10,6 +10,7 @@ from cudf._lib.cpp.column.column cimport column, column_view from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from strings_udf._lib.cpp.strings_udf cimport ( + column_from_udf_string_array as cpp_column_from_udf_string_array, to_string_view_array as cpp_to_string_view_array, ) @@ -22,3 +23,15 @@ def to_string_view_array(Column strings_col): device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) return Buffer(device_buffer) + + +def from_udf_string_array(DeviceBuffer d_buffer): + cdef size_t size = d_buffer.c_size() + cdef void* data = d_buffer.c_data() + cdef unique_ptr[column] c_result + # data = + + with nogil: + c_result = move(cpp_column_from_udf_string_array(data, size)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 675507bccde..babda6b2d2a 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -23,9 +23,9 @@ # String object definitions -class DString(types.Type): +class UDFString(types.Type): def __init__(self): - super().__init__(name="dstring") + super().__init__(name="udf_string") llty = default_manager[self].get_value_type() self.size_bytes = llty.get_abi_size(target_data) @@ -56,9 +56,9 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, self._members) -@register_model(DString) -class dstring_model(models.StructModel): - # from dstring.hpp: +@register_model(UDFString) +class udf_string_model(models.StructModel): + # from udf_string.hpp: # private: # char* m_data{}; # cudf::size_type m_bytes{}; @@ -74,8 +74,9 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, self._members) -any_string_ty = (StringView, DString, types.StringLiteral) +any_string_ty = (StringView, UDFString, types.StringLiteral) string_view = StringView() +udf_string = UDFString() class StrViewArgHandler: @@ -93,7 +94,9 @@ class StrViewArgHandler: """ def prepare_args(self, ty, val, **kwargs): - if isinstance(ty, types.CPointer) and isinstance(ty.dtype, StringView): + if isinstance(ty, types.CPointer) and isinstance( + ty.dtype, (StringView, UDFString) + ): return types.uint64, val.ptr else: return ty, val @@ -102,6 +105,26 @@ def prepare_args(self, ty, val, **kwargs): str_view_arg_handler = StrViewArgHandler() +# a python object for numba to grab on to, just to have +# something to replace with code +def maybe_post_process_result(result): + pass + + +@cuda_decl_registry.register_global(maybe_post_process_result) +class MaybePostProcessResult(AbstractTemplate): + def generic(self, args, kws): + # a UDF may be typed to return a string_view in some edge cases + # 1. a string is returned unmodified from an input column + # 2. a view of a string variable is returned such as a substring + # in both cases the result must be promoted to udf_string to be + # returned. This requires a copy. + if len(args) == 1 and isinstance(args[0], StringView): + return nb_signature(udf_string, args[0]) + else: + return nb_signature(args[0], args[0]) + + # String functions @cuda_decl_registry.register_global(len) class StringLength(AbstractTemplate): @@ -113,7 +136,7 @@ def generic(self, args, kws): if isinstance(args[0], any_string_ty) and len(args) == 1: # length: # string_view -> int32 - # dstring -> int32 + # udf_string -> int32 # literal -> int32 return nb_signature(size_type, args[0]) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index df0902dfa98..e7deed246c0 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -5,6 +5,7 @@ from numba import cuda, types from numba.core import cgutils +from numba.core.datamodel import default_manager from numba.core.typing import signature as nb_signature from numba.cuda.cudadrv import nvvm from numba.cuda.cudaimpl import ( @@ -13,7 +14,13 @@ ) from strings_udf._lib.tables import get_character_flags_table_ptr -from strings_udf._typing import size_type, string_view +from strings_udf._typing import ( + StringView, + maybe_post_process_result, + size_type, + string_view, + udf_string, +) character_flags_table_ptr = get_character_flags_table_ptr() @@ -107,6 +114,45 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): return sv._getvalue() +# utilities +_create_udf_string_from_string_view = cuda.declare_device( + "udf_string_from_string_view", + types.void(types.CPointer(string_view), types.CPointer(udf_string)), +) + + +def call_create_udf_string_from_string_view(sv, udf_str): + _create_udf_string_from_string_view(sv, udf_str) + + +@cuda_lower(maybe_post_process_result, types.Any) +def maybe_post_process_result_impl(context, builder, sig, args): + if not isinstance(sig.args[0], StringView): + return args[0] + else: + sv_ptr = builder.alloca(args[0].type) + + # obtain llvm type for udf_string + udf_str_ptr = builder.alloca( + default_manager[udf_string].get_value_type() + ) + builder.store(args[0], sv_ptr) + + _ = context.compile_internal( + builder, + call_create_udf_string_from_string_view, + nb_signature( + types.void, _STR_VIEW_PTR, types.CPointer(udf_string) + ), + (sv_ptr, udf_str_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + + return result._getvalue() + + # String function implementations def call_len_string_view(st): return _string_view_len(st) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index f214915ae12..62bb7466696 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -9,17 +9,26 @@ from numba.types import CPointer, void import cudf +import rmm from cudf.testing._utils import assert_eq import strings_udf -from strings_udf._lib.cudf_jit_udf import to_string_view_array -from strings_udf._typing import str_view_arg_handler, string_view +from strings_udf._lib.cudf_jit_udf import ( + from_udf_string_array, + to_string_view_array, +) +from strings_udf._typing import ( + maybe_post_process_result, + str_view_arg_handler, + string_view, + udf_string, +) if not strings_udf.ENABLED: pytest.skip("Strings UDF not enabled.", allow_module_level=True) -def get_kernel(func, dtype): +def get_kernel(func, dtype, size): """ Create a kernel for testing a single scalar string function Allocates an output vector with a dtype specified by the caller @@ -28,17 +37,22 @@ def get_kernel(func, dtype): """ func = cuda.jit(device=True)(func) - outty = numba.np.numpy_support.from_dtype(dtype) - sig = nb_signature(void, CPointer(string_view), outty[::1]) + + if dtype == "str": + outty = CPointer(udf_string) + else: + outty = numba.np.numpy_support.from_dtype(dtype)[::1] + sig = nb_signature(void, CPointer(string_view), outty) @cuda.jit( sig, link=[strings_udf.ptxpath], extensions=[str_view_arg_handler] ) def kernel(input_strings, output_col): id = cuda.grid(1) - if id < len(output_col): + if id < size: st = input_strings[id] result = func(st) + result = maybe_post_process_result(result) output_col[id] = result return kernel @@ -53,14 +67,21 @@ def run_udf_test(data, func, dtype): and then assembles the result back into a cuDF series before comparing it with the equivalent pandas result """ - dtype = np.dtype(dtype) + if dtype == "str": + output_ary = rmm.DeviceBuffer(size=len(data) * udf_string.size_bytes) + else: + dtype = np.dtype(dtype) + output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) + cudf_column = cudf.core.column.as_column(data) str_view_ary = to_string_view_array(cudf_column) - output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) - - kernel = get_kernel(func, dtype) + kernel = get_kernel(func, dtype, len(data)) kernel.forall(len(data))(str_view_ary, output_ary) + + if dtype == "str": + output_ary = from_udf_string_array(output_ary) + got = cudf.Series(output_ary, dtype=dtype) expect = pd.Series(data).apply(func) assert_eq(expect, got, check_dtype=False) @@ -259,3 +280,10 @@ def func(st): return st.startswith(substr) run_udf_test(data, func, "bool") + + +def test_string_udf_return_string(data): + def func(st): + return st + + run_udf_test(data, func, "str") From 238c86224a06ea31ad458b882507ef8eded265d4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 14 Oct 2022 09:50:41 -0700 Subject: [PATCH 04/54] returning strings working --- python/cudf/cudf/core/udf/__init__.py | 33 +++++++++++++++++-- python/cudf/cudf/core/udf/utils.py | 17 ++++++++-- python/cudf/cudf/tests/test_udf_masked_ops.py | 8 +++++ python/strings_udf/strings_udf/lowering.py | 18 ++++++++++ 4 files changed, 71 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 443466b28bd..500ca5e04d0 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, NVIDIA CORPORATION. +import cupy as cp import numpy as np from numba import cuda, types from numba.cuda.cudaimpl import ( @@ -6,6 +7,9 @@ registry as cuda_lowering_registry, ) +import rmm + +from cudf.core.column import as_column from cudf.core.dtypes import dtype from cudf.core.udf import api, row_function, utils from cudf.utils.dtypes import STRING_TYPES @@ -30,8 +34,15 @@ from . import strings_typing # isort: skip from . import strings_lowering # isort: skip from strings_udf import ptxpath - from strings_udf._lib.cudf_jit_udf import to_string_view_array - from strings_udf._typing import str_view_arg_handler, string_view + from strings_udf._lib.cudf_jit_udf import ( + from_udf_string_array, + to_string_view_array, + ) + from strings_udf._typing import ( + str_view_arg_handler, + string_view, + udf_string, + ) # add an overload of MaskedType.__init__(string_view, bool) cuda_lower(api.Masked, strings_typing.string_view, types.boolean)( @@ -49,8 +60,26 @@ utils.JIT_SUPPORTED_TYPES |= STRING_TYPES utils.ptx_files.append(ptxpath) utils.arg_handlers.append(str_view_arg_handler) + utils.udf_return_type_map[string_view] = udf_string row_function.itemsizes[dtype("O")] = string_view.size_bytes + def _return_arr_from_dtype(dt, size): + if dt == np.dtype("O"): + result = rmm.DeviceBuffer(size=size * udf_string.size_bytes) + return result + else: + return cp.empty(size, dtype=dt) + + utils._return_arr_from_dtype = _return_arr_from_dtype + + def _post_process_output_col(col, retty): + if retty == np.dtype("O"): + return from_udf_string_array(col) + else: + return as_column(col, retty) + + utils._post_process_output_col = _post_process_output_col + _STRING_UDFS_ENABLED = True else: del strings_udf diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index fa79088046c..7fb19694b32 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -4,6 +4,7 @@ import cachetools import cupy as cp +import numba import numpy as np from numba import cuda, typeof from numba.core.errors import TypingError @@ -31,6 +32,7 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) arg_handlers: List[Any] = [] ptx_files: List[Any] = [] +udf_return_type_map: Dict[Any, Any] = {} @_cudf_nvtx_annotate @@ -54,6 +56,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): # Get the return type. The PTX is also returned by compile_udf, but is not # needed here. ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) else: @@ -64,6 +67,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): if not isinstance(numba_output_type, MaskedType) else numba_output_type.value_type ) + result = udf_return_type_map.get(result, result) # _get_udf_return_type will throw a TypingError if the user tries to use # a field in the row containing an unsupported dtype, except in the @@ -142,9 +146,12 @@ def _construct_signature(frame, return_type, args): actually JIT the kernel itself later, accounting for types and offsets. Skips columns with unsupported dtypes. """ - + if return_type in udf_return_type_map.values(): + return_type = CPointer(return_type) + else: + return_type = return_type[::1] # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type[::1], boolean[::1])) + return_type = Tuple((return_type, boolean[::1])) offsets = [] sig = [return_type, int64] for col in _supported_cols_from_frame(frame).values(): @@ -213,7 +220,11 @@ def _compile_or_get(frame, func, args, kernel_getter=None): # could be a MaskedType or a scalar type. kernel, scalar_return_type = kernel_getter(frame, func, args) - np_return_type = numpy_support.as_dtype(scalar_return_type) + try: + np_return_type = numpy_support.as_dtype(scalar_return_type) + except numba.core.errors.NumbaNotImplementedError: + # TODO: fix + np_return_type = np.dtype("object") precompiled[cache_key] = (kernel, np_return_type) return kernel, np_return_type diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 20245bd2a20..45a57296cba 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -860,6 +860,14 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test +def test_string_udf_return_string(str_udf_data): + def func(row): + return row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] ) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index e7deed246c0..d8087e50253 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -114,6 +114,24 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): return sv._getvalue() +@cuda_lowering_registry.lower_cast(string_view, udf_string) +def cast_string_view_to_udf_string(context, builder, fromty, toty, val): + sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) + udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) + builder.store(val, sv_ptr) + _ = context.compile_internal( + builder, + call_create_udf_string_from_string_view, + nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), + (sv_ptr, udf_str_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + + return result._getvalue() + + # utilities _create_udf_string_from_string_view = cuda.declare_device( "udf_string_from_string_view", From 0544c2310a6b661455a0721cabafe0adea382474 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 17 Oct 2022 08:28:26 -0700 Subject: [PATCH 05/54] clean up code a bit --- python/cudf/cudf/core/udf/__init__.py | 41 +++++--------------- python/cudf/cudf/core/udf/masked_lowering.py | 9 +++-- python/cudf/cudf/core/udf/masked_typing.py | 2 +- python/cudf/cudf/core/udf/utils.py | 17 +++++--- 4 files changed, 26 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 500ca5e04d0..fcdbb64396c 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -27,6 +27,7 @@ | {types.boolean} ) _STRING_UDFS_ENABLED = False +cudf_str_dtype = dtype(str) try: import strings_udf @@ -44,42 +45,16 @@ udf_string, ) - # add an overload of MaskedType.__init__(string_view, bool) - cuda_lower(api.Masked, strings_typing.string_view, types.boolean)( - masked_lowering.masked_constructor - ) - - # add an overload of pack_return(string_view) - cuda_lower(api.pack_return, strings_typing.string_view)( - masked_lowering.pack_return_scalar_impl - ) - _supported_masked_types |= {strings_typing.string_view} - utils.launch_arg_getters[dtype("O")] = to_string_view_array - utils.masked_array_types[dtype("O")] = string_view + utils.launch_arg_getters[cudf_str_dtype] = to_string_view_array + utils.output_col_getters[cudf_str_dtype] = from_udf_string_array + utils.masked_array_types[cudf_str_dtype] = string_view + row_function.itemsizes[cudf_str_dtype] = string_view.size_bytes + utils.JIT_SUPPORTED_TYPES |= STRING_TYPES utils.ptx_files.append(ptxpath) utils.arg_handlers.append(str_view_arg_handler) utils.udf_return_type_map[string_view] = udf_string - row_function.itemsizes[dtype("O")] = string_view.size_bytes - - def _return_arr_from_dtype(dt, size): - if dt == np.dtype("O"): - result = rmm.DeviceBuffer(size=size * udf_string.size_bytes) - return result - else: - return cp.empty(size, dtype=dt) - - utils._return_arr_from_dtype = _return_arr_from_dtype - - def _post_process_output_col(col, retty): - if retty == np.dtype("O"): - return from_udf_string_array(col) - else: - return as_column(col, retty) - - utils._post_process_output_col = _post_process_output_col - _STRING_UDFS_ENABLED = True else: del strings_udf @@ -87,4 +62,6 @@ def _post_process_output_col(col, retty): except ImportError as e: # allow cuDF to work without strings_udf pass -masked_typing.register_masked_constructor(_supported_masked_types) + +masked_typing._register_masked_constructor_typing(_supported_masked_types) +masked_lowering._register_masked_constructor_lowering(_supported_masked_types) diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py index f825b6538bf..37f3117e756 100644 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -372,10 +372,6 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # Masked constructor for use in a kernel for testing -@lower_builtin(api.Masked, types.Boolean, types.boolean) -@lower_builtin(api.Masked, types.Number, types.boolean) -@lower_builtin(api.Masked, types.NPDatetime, types.boolean) -@lower_builtin(api.Masked, types.NPTimedelta, types.boolean) def masked_constructor(context, builder, sig, args): ty = sig.return_type value, valid = args @@ -385,6 +381,11 @@ def masked_constructor(context, builder, sig, args): return masked._getvalue() +def _register_masked_constructor_lowering(supported_masked_types): + for ty in supported_masked_types: + lower_builtin(api.Masked, ty, types.boolean)(masked_constructor) + + # Allows us to make an instance of MaskedType a global variable # and properly use it inside functions we will later compile @cuda_lowering_registry.lower_constant(MaskedType) diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index a815a9f6dae..7baf2d585e2 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -169,7 +169,7 @@ def typeof_masked(val, c): # Implemented typing for Masked(value, valid) - the construction of a Masked # type in a kernel. -def register_masked_constructor(supported_masked_types): +def _register_masked_constructor_typing(supported_masked_types): class MaskedConstructor(ConcreteTemplate): key = api.Masked cases = [ diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 7fb19694b32..b4912882b56 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -11,6 +11,8 @@ from numba.np import numpy_support from numba.types import CPointer, Poison, Tuple, boolean, int64, void +import rmm + from cudf.core.column.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.masked_typing import MaskedType @@ -33,6 +35,9 @@ arg_handlers: List[Any] = [] ptx_files: List[Any] = [] udf_return_type_map: Dict[Any, Any] = {} +masked_array_types: Dict[Any, Any] = {} +launch_arg_getters: Dict[Any, Any] = {} +output_col_getters: Dict[Any, Any] = {} @_cudf_nvtx_annotate @@ -116,9 +121,6 @@ def _supported_cols_from_frame(frame): } -masked_array_types: Dict[Any, Any] = {} - - def _masked_array_type_from_col(col): """ Return a type representing a tuple of arrays, @@ -241,9 +243,6 @@ def _get_kernel(kernel_string, globals_, sig, func): return kernel -launch_arg_getters: Dict[Any, Any] = {} - - def _get_input_args_from_frame(fr): args = [] offsets = [] @@ -265,8 +264,14 @@ def _get_input_args_from_frame(fr): def _return_arr_from_dtype(dt, size): + extensionty = udf_return_type_map.get(masked_array_types.get(dt)) + if extensionty: + return rmm.DeviceBuffer(size=size * extensionty.size_bytes) return cp.empty(size, dtype=dt) def _post_process_output_col(col, retty): + getter = output_col_getters.get(retty) + if getter: + col = getter(col) return as_column(col, retty) From a5661bcf7fc0ff21bc12482d7cd51e8ec001f852 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 18 Oct 2022 10:26:47 -0400 Subject: [PATCH 06/54] change void* to udf_string* --- .../cpp/include/cudf/strings/udf/case.cuh | 4 +- .../include/cudf/strings/udf/char_types.cuh | 2 +- .../cpp/include/cudf/strings/udf/numeric.cuh | 4 +- .../cpp/include/cudf/strings/udf/search.cuh | 2 +- .../cpp/include/cudf/strings/udf/split.cuh | 12 ++-- .../include/cudf/strings/udf/starts_with.cuh | 8 +-- .../cpp/include/cudf/strings/udf/strip.cuh | 6 +- .../cpp/include/cudf/strings/udf/udf_apis.hpp | 18 ++++-- .../cpp/src/strings/udf/udf_apis.cu | 55 ++++++++++++------- 9 files changed, 67 insertions(+), 44 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh index 98b25f85f74..17ee98ace6f 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -25,7 +25,9 @@ namespace cudf { namespace strings { namespace udf { -// global variables for character-type flags and case conversion +/** + * @brief Global variables for character-type flags and case conversion + */ struct chars_tables { cudf::strings::detail::character_flags_table_type* flags_table; cudf::strings::detail::character_cases_table_type* cases_table; diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh index 9320686442b..2c73328843f 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh @@ -25,7 +25,7 @@ namespace strings { namespace udf { /** - * @brief Returns true if all characters in the string are of the type specified. + * @brief Returns true if all characters in the string are of the type specified * * The output will be false if the string is empty or has at least one character * not of the specified type. If all characters fit the type then true is returned. diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh index 48f709ae318..2cd6eaaa298 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -27,7 +27,7 @@ namespace strings { namespace udf { /** - * @brief Converts a string into an integer. + * @brief Converts a string into an integer * * The '+' and '-' are allowed but only at the beginning of the string. * The string is expected to contain base-10 [0-9] characters only. @@ -59,7 +59,7 @@ __device__ inline udf_string to_string(int64_t value) } /** - * @brief Converts a string into a double. + * @brief Converts a string into a double * * Support scientific notation as well. * Overflow goes to inf or -inf and underflow may go to 0. diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh index ef15886f1f5..ed3a873b655 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh @@ -23,7 +23,7 @@ namespace udf { /** * @brief Returns the number of times that the target string appears - * in the source string. + * in the source string * * If `start <= 0` the search begins at the beginning of the `source` string. * If `end <=0` or `end` is greater the length of the `source` string, diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh index f7fa32d0e45..58e3e6fcfb4 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -68,7 +68,7 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, } /** - * @brief Split string using given target array. + * @brief Split string using given target array * * @param d_str String to split * @param tgt Character array encoded in UTF-8 used for identifying split points @@ -86,7 +86,7 @@ __device__ inline int split(cudf::string_view const d_str, } /** - * @brief Split string using given target array. + * @brief Split string using given target array * * @param d_str String to split * @param tgt Null-terminated character array encoded in UTF-8 used for identifying split points @@ -102,7 +102,7 @@ __device__ inline int split(cudf::string_view const d_str, } /** - * @brief Split string on whitespace. + * @brief Split string on whitespace * * This will create tokens by splitting on one or more consecutive whitespace characters * found in `d_str`. @@ -125,7 +125,7 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::str } /** - * @brief Join an array of strings with a separator. + * @brief Join an array of strings with a separator * * @code{.cpp} * auto separator = cudf::string_view{"::", 2}; @@ -156,7 +156,7 @@ __device__ inline udf_string join(cudf::string_view const separator, } /** - * @brief Join an array of strings with a separator. + * @brief Join an array of strings with a separator * * @param separator Null-terminated UTF-8 string * @param bytes Number of bytes to read from `separator` @@ -173,7 +173,7 @@ __device__ inline udf_string join(char const* separator, } /** - * @brief Join an array of strings with a separator. + * @brief Join an array of strings with a separator * * @param separator Null-terminated UTF-8 string * @param input An array of strings to join diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh index 38c609ae505..8e9ffce4f52 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh @@ -24,7 +24,7 @@ namespace udf { /** * @brief Returns true if the beginning of the specified string - * matches the given character array. + * matches the given character array * * @param dstr String to check * @param tgt Character array encoded in UTF-8 @@ -42,7 +42,7 @@ __device__ inline bool starts_with(cudf::string_view const dstr, /** * @brief Returns true if the beginning of the specified string - * matches the given target string. + * matches the given target string * * @param dstr String to check * @param tgt String to match @@ -55,7 +55,7 @@ __device__ inline bool starts_with(cudf::string_view const dstr, cudf::string_vi /** * @brief Returns true if the end of the specified string - * matches the given character array. + * matches the given character array * * @param dstr String to check * @param tgt Character array encoded in UTF-8 @@ -73,7 +73,7 @@ __device__ inline bool ends_with(cudf::string_view const dstr, /** * @brief Returns true if the end of the specified string - * matches the given target` string. + * matches the given target` string * * @param dstr String to check * @param tgt String to match diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh index 1cd7dfbaa78..6c6639835cd 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh @@ -26,7 +26,7 @@ namespace strings { namespace udf { /** - * @brief Strip characters from the beginning and/or end of the given string. + * @brief Strip characters from the beginning and/or end of the given string * * The `d_to_strip` is interpretted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. @@ -56,7 +56,7 @@ __device__ udf_string strip(cudf::string_view const d_str, } /** - * @brief Strip characters from the beginning of the given string. + * @brief Strip characters from the beginning of the given string * * The `d_to_strip` is interpretted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. @@ -81,7 +81,7 @@ __device__ udf_string lstrip(cudf::string_view const d_str, cudf::string_view d_ } /** - * @brief Strip characters from the end of the given string. + * @brief Strip characters from the end of the given string * * The `d_to_strip` is interpretted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index f5807be151b..9ae986d2327 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -27,22 +27,30 @@ namespace cudf { namespace strings { namespace udf { +class udf_string; + /** * @brief Return a cudf::string_view array for the given strings column * + * No string data is copied so the input column controls the lifetime of the + * underlying strings. + * + * New device memory is allocated and returned to hold just the string_view instances. + * * @param input Strings column to convert to a string_view array. - * @throw cudf::logic_error if input is not a strings column. + * @return Array of string_view objects in device memory */ std::unique_ptr to_string_view_array(cudf::column_view const input); /** - * @brief Return a cudf::column given an array of udf_string objects. + * @brief Return a STRINGS column given an array of udf_string objects * - * @param d_buffer Pointer to device memory of udf_string objects - * @param d_size The number of bytes in the d_buffer + * @param d_strings Pointer to device memory of udf_string objects + * @param size The number of elements in the buffer array * @return A strings column copy of the udf_string objects */ -std::unique_ptr column_from_udf_string_array(void* d_buffer, std::size_t size); +std::unique_ptr column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size); } // namespace udf } // namespace strings diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index 2bc74c8ca19..8ad255eda9f 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -33,22 +33,26 @@ namespace udf { namespace detail { namespace { -struct free_udf_string_fn { - cudf::strings::udf::udf_string* d_strings; - __device__ void operator()(cudf::size_type idx) { d_strings[idx].clear(); } -}; - -void free_udf_string_array(void* d_buffer, std::size_t buffer_size, rmm::cuda_stream_view stream) +/** + * @brief Frees udf_strings device memory + * + * @param d_buffer Array of udf_strings + */ +void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, + cudf::size_type size, + rmm::cuda_stream_view stream) { - auto const size = - static_cast(buffer_size / sizeof(cudf::strings::udf::udf_string)); - auto d_strings = reinterpret_cast(d_buffer); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), size, - free_udf_string_fn{d_strings}); + [d_strings] __device__(auto idx) { d_strings[idx].clear(); }); } +/** + * @brief Functor wraps string_view objects around udf_string objects + * + * No string data is copied. + */ struct udf_string_to_string_view_transform_fn { __device__ cudf::string_view operator()(cudf::strings::udf::udf_string const& dstr) { @@ -58,6 +62,11 @@ struct udf_string_to_string_view_transform_fn { } // namespace +/** + * @copydoc to_string_view_array + * + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ std::unique_ptr to_string_view_array(cudf::column_view const input, rmm::cuda_stream_view stream) { @@ -67,26 +76,27 @@ std::unique_ptr to_string_view_array(cudf::column_view const .release())); } -std::unique_ptr column_from_udf_string_array(void* d_buffer, - std::size_t buffer_size, +/** + * @copydoc column_from_udf_string_array + + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ +std::unique_ptr column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size, rmm::cuda_stream_view stream) { - auto const size = - static_cast(buffer_size / sizeof(cudf::strings::udf::udf_string)); - auto d_input = reinterpret_cast(d_buffer); - // create string_views of the udf_strings auto indices = rmm::device_uvector(size, stream); thrust::transform(rmm::exec_policy(stream), - d_input, - d_input + size, + d_strings, + d_strings + size, indices.data(), udf_string_to_string_view_transform_fn{}); auto results = cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); // free the individual udf_string elements - free_udf_string_array(d_buffer, buffer_size, stream); + free_udf_string_array(d_strings, size, stream); // return new column return results; @@ -94,14 +104,17 @@ std::unique_ptr column_from_udf_string_array(void* d_buffer, } // namespace detail +// external APIs + std::unique_ptr to_string_view_array(cudf::column_view const input) { return detail::to_string_view_array(input, rmm::cuda_stream_default); } -std::unique_ptr column_from_udf_string_array(void* d_buffer, std::size_t buffer_size) +std::unique_ptr column_from_udf_string_array(udf_string* d_strings, + cudf::size_type size) { - return detail::column_from_udf_string_array(d_buffer, buffer_size, rmm::cuda_stream_default); + return detail::column_from_udf_string_array(d_strings, size, rmm::cuda_stream_default); } } // namespace udf From 9661c4ed2c2af19c76aac294c8987e7c40f64c06 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 18 Oct 2022 12:03:57 -0400 Subject: [PATCH 07/54] update doxygens --- .../cpp/include/cudf/strings/udf/udf_apis.hpp | 5 + .../include/cudf/strings/udf/udf_string.cuh | 2 - .../include/cudf/strings/udf/udf_string.hpp | 97 ++++++++++--------- 3 files changed, 58 insertions(+), 46 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index 9ae986d2327..540dcaf4c18 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -45,6 +45,11 @@ std::unique_ptr to_string_view_array(cudf::column_view const /** * @brief Return a STRINGS column given an array of udf_string objects * + * This will make a copy of the strings in d_string in order to build + * the output column. + * The individual udf_strings are also cleared freeing each of their internal + * device memory buffers. + * * @param d_strings Pointer to device memory of udf_string objects * @param size The number of elements in the buffer array * @return A strings column copy of the udf_string objects diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index ff4a4399e40..d651f9faa2b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -178,8 +178,6 @@ __device__ inline char const* udf_string::data() const { return m_data; } __device__ inline bool udf_string::is_empty() const { return m_bytes == 0; } -__device__ inline bool udf_string::is_null() const { return m_data == nullptr; } - __device__ inline cudf::string_view::const_iterator udf_string::begin() const { return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), 0); diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp index a0da0186b24..c57ab473017 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -19,14 +19,30 @@ #include +// This header contains all class and function declarations so that it +// can be included in a .cpp file which only has declaration requirements +// (i.e. sizeof, conditionally-comparable, explicit conversions, etc). +// The definitions are coded in udf_string.cuh which is to be included +// in .cu files that use this class in kernel calls. + namespace cudf { namespace strings { namespace udf { +/** + * @brief Device string class for use with user-defined functions + * + * This class manages a device buffer of UTF-8 encoded characters + * for string manipulation in a device kernel. + * + * It's methods and behavior are modelled after std::string but + * with special consideration for UTF-8 encoded strings and for + * use within a cuDF UDF. + */ class udf_string { public: /** - * @brief Represents unknown character position or length. + * @brief Represents unknown character position or length */ static constexpr cudf::size_type npos = static_cast(-1); @@ -41,7 +57,7 @@ class udf_string { udf_string() = default; /** - * @brief Create a string using existing device memory. + * @brief Create a string using existing device memory * * The given memory is copied into the instance returned. * @@ -51,7 +67,7 @@ class udf_string { __device__ udf_string(char const* data, cudf::size_type bytes); /** - * @brief Create a string object from a null-terminated character array. + * @brief Create a string object from a null-terminated character array * * The given memory is copied into the instance returned. * @@ -61,7 +77,7 @@ class udf_string { __device__ udf_string(char const* data); /** - * @brief Create a string object from a cudf::string_view. + * @brief Create a string object from a cudf::string_view * * The input string data is copied into the instance returned. * @@ -70,7 +86,7 @@ class udf_string { __device__ udf_string(cudf::string_view const str); /** - * @brief Create a string object with `count` copies of character `chr`. + * @brief Create a string object with `count` copies of character `chr` * * @param count Number of times to copy `chr` * @param chr Character from which to create the string @@ -78,7 +94,7 @@ class udf_string { __device__ udf_string(cudf::size_type count, cudf::char_utf8 chr); /** - * @brief Create a string object from another instance. + * @brief Create a string object from another instance * * The string data is copied from the `src` into the instance returned. * @@ -87,7 +103,7 @@ class udf_string { __device__ udf_string(udf_string const& src); /** - * @brief Create a string object from a move reference. + * @brief Create a string object from a move reference * * The string data is moved from `src` into the instance returned. * The `src` will have no content. @@ -104,41 +120,34 @@ class udf_string { __device__ udf_string& operator=(char const*); /** - * @brief Return the number of bytes in this string. + * @brief Return the number of bytes in this string */ __device__ cudf::size_type size_bytes() const; /** - * @brief Return the number of characters in this string. + * @brief Return the number of characters in this string */ __device__ cudf::size_type length() const; /** - * @brief Return the maximum number of bytes a udf_string can hold. + * @brief Return the maximum number of bytes a udf_string can hold */ __device__ cudf::size_type max_size() const; /** - * @brief Return the internal pointer to the character array for this object. + * @brief Return the internal pointer to the character array for this object */ __device__ char* data(); __device__ char const* data() const; /** - * @brief Returns true if there are no characters in this string. + * @brief Returns true if there are no characters in this string */ __device__ bool is_empty() const; - /** - * @brief Returns true if `data()==nullptr` - * - * This is experimental and may be removed in the futre. - */ - __device__ bool is_null() const; - /** * @brief Returns an iterator that can be used to navigate through - * the UTF-8 characters in this string. + * the UTF-8 characters in this string * * This returns a `cudf::string_view::const_iterator` which is read-only. */ @@ -146,7 +155,7 @@ class udf_string { __device__ cudf::string_view::const_iterator end() const; /** - * @brief Returns the character at the specified position. + * @brief Returns the character at the specified position * * This will return 0 if `pos >= length()`. * @@ -156,7 +165,7 @@ class udf_string { __device__ cudf::char_utf8 at(cudf::size_type pos) const; /** - * @brief Returns the character at the specified index. + * @brief Returns the character at the specified index * * This will return 0 if `pos >= length()`. * Note this is read-only. Use replace() to modify a character. @@ -167,7 +176,7 @@ class udf_string { __device__ cudf::char_utf8 operator[](cudf::size_type pos) const; /** - * @brief Return the byte offset for a given character position. + * @brief Return the byte offset for a given character position * * The byte offset for the character at `pos` such that * `data() + byte_offset(pos)` points to the memory location @@ -238,14 +247,14 @@ class udf_string { __device__ bool operator>=(cudf::string_view const rhs) const; /** - * @brief Remove all bytes from this string. + * @brief Remove all bytes from this string * * All pointers, references, and iterators are invalidated. */ __device__ void clear(); /** - * @brief Resizes string to contain `count` bytes. + * @brief Resizes string to contain `count` bytes * * If `count > size_bytes()` then zero-padding is added. * If `count < size_bytes()` then the string is truncated to size `count`. @@ -257,7 +266,7 @@ class udf_string { __device__ void resize(cudf::size_type count); /** - * @brief Reserve `count` bytes in this string. + * @brief Reserve `count` bytes in this string * * If `count > capacity()`, new memory is allocated and `capacity()` will * be greater than or equal to `count`. @@ -268,12 +277,12 @@ class udf_string { __device__ void reserve(cudf::size_type count); /** - * @brief Returns the number of bytes that the string has allocated. + * @brief Returns the number of bytes that the string has allocated */ __device__ cudf::size_type capacity() const; /** - * @brief Reduces internal allocation to just `size_bytes()`. + * @brief Reduces internal allocation to just `size_bytes()` * * All pointers, references, and iterators may be invalidated. */ @@ -313,7 +322,7 @@ class udf_string { __device__ udf_string& assign(char const* str, cudf::size_type bytes); /** - * @brief Append a string to the end of this string. + * @brief Append a string to the end of this string * * @param str String to append * @return This string with the appended argument @@ -321,7 +330,7 @@ class udf_string { __device__ udf_string& operator+=(cudf::string_view const str); /** - * @brief Append a character to the end of this string. + * @brief Append a character to the end of this string * * @param str Character to append * @return This string with the appended argument @@ -330,7 +339,7 @@ class udf_string { /** * @brief Append a null-terminated device memory character array - * to the end of this string. + * to the end of this string * * @param str String to append * @return This string with the appended argument @@ -338,7 +347,7 @@ class udf_string { __device__ udf_string& operator+=(char const* str); /** - * @brief Append a null-terminated character array to the end of this string. + * @brief Append a null-terminated character array to the end of this string * * @param str String to append * @return This string with the appended argument @@ -346,7 +355,7 @@ class udf_string { __device__ udf_string& append(char const* str); /** - * @brief Append a character array to the end of this string. + * @brief Append a character array to the end of this string * * @param str Character array to append * @param bytes Number of bytes from `str` to append. @@ -355,7 +364,7 @@ class udf_string { __device__ udf_string& append(char const* str, cudf::size_type bytes); /** - * @brief Append a string to the end of this string. + * @brief Append a string to the end of this string * * @param str String to append * @return This string with the appended argument @@ -373,7 +382,7 @@ class udf_string { __device__ udf_string& append(cudf::char_utf8 chr, cudf::size_type count = 1); /** - * @brief Insert a string into the character position specified. + * @brief Insert a string into the character position specified * * There is no effect if `pos < 0 or pos > length()`. * @@ -384,7 +393,7 @@ class udf_string { __device__ udf_string& insert(cudf::size_type pos, cudf::string_view const str); /** - * @brief Insert a null-terminated character array into the character position specified. + * @brief Insert a null-terminated character array into the character position specified * * There is no effect if `pos < 0 or pos > length()`. * @@ -395,7 +404,7 @@ class udf_string { __device__ udf_string& insert(cudf::size_type pos, char const* data); /** - * @brief Insert a character array into the character position specified. + * @brief Insert a character array into the character position specified * * There is no effect if `pos < 0 or pos > length()`. * @@ -407,7 +416,7 @@ class udf_string { __device__ udf_string& insert(cudf::size_type pos, char const* data, cudf::size_type bytes); /** - * @brief Insert a character one or more times into the character position specified. + * @brief Insert a character one or more times into the character position specified * * There is no effect if `pos < 0 or pos > length()`. * @@ -419,7 +428,7 @@ class udf_string { __device__ udf_string& insert(cudf::size_type pos, cudf::size_type count, cudf::char_utf8 chr); /** - * @brief Returns a substring of this string. + * @brief Returns a substring of this string * * An empty string is returned if `pos < 0 or pos >= length()`. * @@ -432,7 +441,7 @@ class udf_string { __device__ udf_string substr(cudf::size_type pos, cudf::size_type count = npos) const; /** - * @brief Replace a range of characters with a given string. + * @brief Replace a range of characters with a given string * * Replaces characters in range `[pos, pos + count]` with `str`. * There is no effect if `pos < 0 or pos > length()`. @@ -447,7 +456,7 @@ class udf_string { cudf::string_view const str); /** - * @brief Replace a range of characters with a null-terminated character array. + * @brief Replace a range of characters with a null-terminated character array * * Replaces characters in range `[pos, pos + count)` with `data`. * There is no effect if `pos < 0 or pos > length()`. @@ -460,7 +469,7 @@ class udf_string { __device__ udf_string& replace(cudf::size_type pos, cudf::size_type count, char const* data); /** - * @brief Replace a range of characters with a given character array. + * @brief Replace a range of characters with a given character array * * Replaces characters in range `[pos, pos + count)` with `[data, data + bytes)`. * There is no effect if `pos < 0 or pos > length()`. @@ -477,7 +486,7 @@ class udf_string { cudf::size_type bytes); /** - * @brief Replace a range of characters with a character one or more times. + * @brief Replace a range of characters with a character one or more times * * Replaces characters in range `[pos, pos + count)` with `chr` `chr_count` times. * There is no effect if `pos < 0 or pos > length()`. @@ -494,7 +503,7 @@ class udf_string { cudf::char_utf8 chr); /** - * @brief Removes specified characters from this string. + * @brief Removes specified characters from this string * * Removes `min(count, length() - pos)` characters starting at `pos`. * There is no effect if `pos < 0 or pos >= length()`. From a6f03a334aae76216d0c4ee9e3ce165dcc848fea Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 18 Oct 2022 11:58:38 -0700 Subject: [PATCH 08/54] remove unnecessary explicit casting --- python/strings_udf/strings_udf/_typing.py | 20 ----------- python/strings_udf/strings_udf/lowering.py | 36 +------------------ .../strings_udf/tests/test_string_udfs.py | 8 +---- 3 files changed, 2 insertions(+), 62 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index babda6b2d2a..63d2cdf69aa 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -105,26 +105,6 @@ def prepare_args(self, ty, val, **kwargs): str_view_arg_handler = StrViewArgHandler() -# a python object for numba to grab on to, just to have -# something to replace with code -def maybe_post_process_result(result): - pass - - -@cuda_decl_registry.register_global(maybe_post_process_result) -class MaybePostProcessResult(AbstractTemplate): - def generic(self, args, kws): - # a UDF may be typed to return a string_view in some edge cases - # 1. a string is returned unmodified from an input column - # 2. a view of a string variable is returned such as a substring - # in both cases the result must be promoted to udf_string to be - # returned. This requires a copy. - if len(args) == 1 and isinstance(args[0], StringView): - return nb_signature(udf_string, args[0]) - else: - return nb_signature(args[0], args[0]) - - # String functions @cuda_decl_registry.register_global(len) class StringLength(AbstractTemplate): diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index d8087e50253..f7e67129ebe 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -14,13 +14,7 @@ ) from strings_udf._lib.tables import get_character_flags_table_ptr -from strings_udf._typing import ( - StringView, - maybe_post_process_result, - size_type, - string_view, - udf_string, -) +from strings_udf._typing import size_type, string_view, udf_string character_flags_table_ptr = get_character_flags_table_ptr() @@ -143,34 +137,6 @@ def call_create_udf_string_from_string_view(sv, udf_str): _create_udf_string_from_string_view(sv, udf_str) -@cuda_lower(maybe_post_process_result, types.Any) -def maybe_post_process_result_impl(context, builder, sig, args): - if not isinstance(sig.args[0], StringView): - return args[0] - else: - sv_ptr = builder.alloca(args[0].type) - - # obtain llvm type for udf_string - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() - ) - builder.store(args[0], sv_ptr) - - _ = context.compile_internal( - builder, - call_create_udf_string_from_string_view, - nb_signature( - types.void, _STR_VIEW_PTR, types.CPointer(udf_string) - ), - (sv_ptr, udf_str_ptr), - ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - - return result._getvalue() - - # String function implementations def call_len_string_view(st): return _string_view_len(st) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 62bb7466696..d40b186fdfc 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -17,12 +17,7 @@ from_udf_string_array, to_string_view_array, ) -from strings_udf._typing import ( - maybe_post_process_result, - str_view_arg_handler, - string_view, - udf_string, -) +from strings_udf._typing import str_view_arg_handler, string_view, udf_string if not strings_udf.ENABLED: pytest.skip("Strings UDF not enabled.", allow_module_level=True) @@ -52,7 +47,6 @@ def kernel(input_strings, output_col): if id < size: st = input_strings[id] result = func(st) - result = maybe_post_process_result(result) output_col[id] = result return kernel From ebaf0881358d66e690c60b5de0a3c7d2c76b4248 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 19 Oct 2022 15:16:09 -0400 Subject: [PATCH 09/54] add pad utility functions --- .../cpp/include/cudf/strings/udf/pad.cuh | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh new file mode 100644 index 00000000000..38dbee72a84 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh @@ -0,0 +1,72 @@ + +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "udf_string.cuh" + +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Pad beginning and/or end of a string with the given fill character + * + * The side_type::BOTH will attempt to center the text using the `fill_char`. + * If the `width` is smaller than the length of `d_str` no change occurs. + * + * @tparam side Specify where the padding should occur + * @param d_str String to pad + * @param width Minimum length in characters of the output string + * @param fill_char Character used for padding + */ +template +__device__ udf_string pad(cudf::string_view const d_str, + cudf::size_type width, + cudf::string_view fill_char = cudf::string_view{" ", 1}) +{ + if (fill_char.empty()) { return udf_string{d_str}; } + + udf_string result; + result.resize(cudf::strings::detail::compute_padded_size(d_str, width, fill_char.size_bytes())); + cudf::strings::detail::pad_impl(d_str, width, *fill_char.begin(), result.data()); + return result; +} + +/** + * @brief Pad beginning of a string with zero '0' + * + * If the `width` is smaller than the length of `d_str` no change occurs. + * + * If d_str starts with a sign character ('-' or '+') then '0' padding + * starts after the sign. + * + * @param d_str String to fill + * @param width Minimum length in characters of the output string + */ +__device__ udf_string zfill(cudf::string_view const d_str, cudf::size_type width) +{ + udf_string result; + result.resize(cudf::strings::detail::compute_padded_size(d_str, width, 1)); + cudf::strings::detail::zfill_impl(d_str, width, result.data()); + return result; +} + +} // namespace udf +} // namespace strings +} // namespace cudf From c3e17acd9ebea9054f2e8d439aa6cf61b45273d7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 19 Oct 2022 15:30:48 -0400 Subject: [PATCH 10/54] fix doxygen for udf_apis.hpp --- python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index 540dcaf4c18..9eb1c72dd5b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -35,7 +35,7 @@ class udf_string; * No string data is copied so the input column controls the lifetime of the * underlying strings. * - * New device memory is allocated and returned to hold just the string_view instances. + * New device memory is allocated and returned to hold just the string_view instances. * * @param input Strings column to convert to a string_view array. * @return Array of string_view objects in device memory @@ -51,7 +51,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const * device memory buffers. * * @param d_strings Pointer to device memory of udf_string objects - * @param size The number of elements in the buffer array + * @param size The number of elements in the d_strings array * @return A strings column copy of the udf_string objects */ std::unique_ptr column_from_udf_string_array(udf_string* d_strings, From 2dae45d904de295f5b6e2ad018131b3c2159194f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 20 Oct 2022 09:38:51 -0400 Subject: [PATCH 11/54] fix to_string to use count_digits --- python/strings_udf/cpp/include/cudf/strings/udf/case.cuh | 2 +- python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh index 17ee98ace6f..7539aeef8e3 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -166,7 +166,7 @@ __device__ inline udf_string to_upper(chars_tables const tables, string_view d_s * @param d_str Input string to convert * @return New string containing the converted characters */ -__device__ inline udf_string swapcase(chars_tables const tables, string_view d_str) +__device__ inline udf_string swap_case(chars_tables const tables, string_view d_str) { cudf::strings::detail::character_flags_table_type case_flag = cudf::strings::detail::IS_LOWER(0xFF) | cudf::strings::detail::IS_UPPER(0xFF); diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh index 2cd6eaaa298..6d176d36cfb 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -51,8 +51,7 @@ __device__ inline udf_string to_string(int64_t value) result.append("0"); return result; } - auto const d_value = static_cast(abs(value)); - auto digits = static_cast(log10(d_value)) + 1 + (value < 0); + auto digits = cudf::strings::detail::count_digits(value); result.resize(digits); cudf::strings::detail::integer_to_string(value, result.data()); return result; From 3467f34db786ba9a58a9b6ffa0fff9c2b88c25cf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 20 Oct 2022 09:46:59 -0400 Subject: [PATCH 12/54] add ALL_FLAGS --- cpp/include/cudf/strings/detail/char_tables.hpp | 1 + .../cpp/include/cudf/strings/udf/case.cuh | 13 +++++++------ .../cpp/include/cudf/strings/udf/numeric.cuh | 3 +-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 4ea7e3ee952..275b7223a3b 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -46,6 +46,7 @@ constexpr uint8_t IS_LOWER(uint8_t x) { return ((x) & (1 << 6)); } constexpr uint8_t IS_SPECIAL(uint8_t x) { return ((x) & (1 << 7)); } constexpr uint8_t IS_ALPHANUM(uint8_t x) { return ((x) & (0x0F)); } constexpr uint8_t IS_UPPER_OR_LOWER(uint8_t x) { return ((x) & ((1 << 5) | (1 << 6))); } +constexpr uint8_t ALL_FLAGS = 0xFF; // Type for the character cases table. using character_cases_table_type = uint16_t; diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh index 7539aeef8e3..b11297cb721 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -100,7 +100,7 @@ __device__ inline udf_string convert_case( /** * @brief Utility for capitalize and title functions * - * @tparam CaptializeNextFn returns true if the next candidate character should be capitalized + * @tparam CapitalizeNextFn returns true if the next candidate character should be capitalized * @param tables The char tables required for conversion * @param d_str Input string to convert * @return New string containing the converted characters @@ -137,8 +137,8 @@ __device__ inline udf_string capitalize(chars_tables const tables, */ __device__ inline udf_string to_lower(chars_tables const tables, string_view d_str) { - cudf::strings::detail::character_flags_table_type case_flag = - cudf::strings::detail::IS_UPPER(0xFF); // convert only upper case characters + cudf::strings::detail::character_flags_table_type case_flag = cudf::strings::detail::IS_UPPER( + cudf::strings::detail::ALL_FLAGS); // convert only upper case characters return detail::convert_case(tables, d_str, case_flag); } @@ -151,8 +151,8 @@ __device__ inline udf_string to_lower(chars_tables const tables, string_view d_s */ __device__ inline udf_string to_upper(chars_tables const tables, string_view d_str) { - cudf::strings::detail::character_flags_table_type case_flag = - cudf::strings::detail::IS_LOWER(0xFF); // convert only lower case characters + cudf::strings::detail::character_flags_table_type case_flag = cudf::strings::detail::IS_LOWER( + cudf::strings::detail::ALL_FLAGS); // convert only lower case characters return detail::convert_case(tables, d_str, case_flag); } @@ -169,7 +169,8 @@ __device__ inline udf_string to_upper(chars_tables const tables, string_view d_s __device__ inline udf_string swap_case(chars_tables const tables, string_view d_str) { cudf::strings::detail::character_flags_table_type case_flag = - cudf::strings::detail::IS_LOWER(0xFF) | cudf::strings::detail::IS_UPPER(0xFF); + cudf::strings::detail::IS_LOWER(cudf::strings::detail::ALL_FLAGS) | + cudf::strings::detail::IS_UPPER(cudf::strings::detail::ALL_FLAGS); return detail::convert_case(tables, d_str, case_flag); } diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh index 6d176d36cfb..4f9e8796fb5 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -51,8 +51,7 @@ __device__ inline udf_string to_string(int64_t value) result.append("0"); return result; } - auto digits = cudf::strings::detail::count_digits(value); - result.resize(digits); + result.resize(cudf::strings::detail::count_digits(value)); cudf::strings::detail::integer_to_string(value, result.data()); return result; } From cf72fc8113fa33dfe4fd0dbad250d782142436e9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Oct 2022 10:19:34 -0400 Subject: [PATCH 13/54] add noexcept decl to appropriate member functions --- .../include/cudf/strings/udf/char_types.cuh | 2 +- .../cpp/include/cudf/strings/udf/search.cuh | 2 +- .../include/cudf/strings/udf/starts_with.cuh | 8 +-- .../include/cudf/strings/udf/udf_string.cuh | 40 +++++++------- .../include/cudf/strings/udf/udf_string.hpp | 54 ++++++++++--------- 5 files changed, 54 insertions(+), 52 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh index 2c73328843f..9320686442b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh @@ -25,7 +25,7 @@ namespace strings { namespace udf { /** - * @brief Returns true if all characters in the string are of the type specified + * @brief Returns true if all characters in the string are of the type specified. * * The output will be false if the string is empty or has at least one character * not of the specified type. If all characters fit the type then true is returned. diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh index ed3a873b655..ef15886f1f5 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh @@ -23,7 +23,7 @@ namespace udf { /** * @brief Returns the number of times that the target string appears - * in the source string + * in the source string. * * If `start <= 0` the search begins at the beginning of the `source` string. * If `end <=0` or `end` is greater the length of the `source` string, diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh index 8e9ffce4f52..38c609ae505 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh @@ -24,7 +24,7 @@ namespace udf { /** * @brief Returns true if the beginning of the specified string - * matches the given character array + * matches the given character array. * * @param dstr String to check * @param tgt Character array encoded in UTF-8 @@ -42,7 +42,7 @@ __device__ inline bool starts_with(cudf::string_view const dstr, /** * @brief Returns true if the beginning of the specified string - * matches the given target string + * matches the given target string. * * @param dstr String to check * @param tgt String to match @@ -55,7 +55,7 @@ __device__ inline bool starts_with(cudf::string_view const dstr, cudf::string_vi /** * @brief Returns true if the end of the specified string - * matches the given character array + * matches the given character array. * * @param dstr String to check * @param tgt Character array encoded in UTF-8 @@ -73,7 +73,7 @@ __device__ inline bool ends_with(cudf::string_view const dstr, /** * @brief Returns true if the end of the specified string - * matches the given target` string + * matches the given target` string. * * @param dstr String to check * @param tgt String to match diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index d651f9faa2b..aa39b94d1c5 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -94,7 +94,7 @@ __device__ inline udf_string::udf_string(udf_string const& src) memcpy(m_data, src.m_data, m_bytes); } -__device__ inline udf_string::udf_string(udf_string&& src) +__device__ inline udf_string::udf_string(udf_string&& src) noexcept : m_data(src.m_data), m_bytes(src.m_bytes), m_capacity(src.m_capacity) { src.m_data = nullptr; @@ -113,7 +113,7 @@ __device__ inline udf_string::~udf_string() { deallocate(m_data); } __device__ inline udf_string& udf_string::operator=(udf_string const& str) { return assign(str); } -__device__ inline udf_string& udf_string::operator=(udf_string&& str) +__device__ inline udf_string& udf_string::operator=(udf_string&& str) noexcept { return assign(std::move(str)); } @@ -125,7 +125,7 @@ __device__ inline udf_string& udf_string::operator=(cudf::string_view const str) __device__ inline udf_string& udf_string::operator=(char const* str) { return assign(str); } -__device__ udf_string& udf_string::assign(udf_string&& str) +__device__ udf_string& udf_string::assign(udf_string&& str) noexcept { if (this == &str) { return *this; } m_data = str.m_data; @@ -160,30 +160,30 @@ __device__ udf_string& udf_string::assign(char const* str, cudf::size_type bytes return *this; } -__device__ inline cudf::size_type udf_string::size_bytes() const { return m_bytes; } +__device__ inline cudf::size_type udf_string::size_bytes() const noexcept { return m_bytes; } -__device__ inline cudf::size_type udf_string::length() const +__device__ inline cudf::size_type udf_string::length() const noexcept { return cudf::strings::detail::characters_in_string(m_data, m_bytes); } -__device__ cudf::size_type udf_string::max_size() const +__device__ cudf::size_type udf_string::max_size() const noexcept { return std::numeric_limits::max() - 1; } -__device__ inline char* udf_string::data() { return m_data; } +__device__ inline char* udf_string::data() noexcept { return m_data; } -__device__ inline char const* udf_string::data() const { return m_data; } +__device__ inline char const* udf_string::data() const noexcept { return m_data; } -__device__ inline bool udf_string::is_empty() const { return m_bytes == 0; } +__device__ inline bool udf_string::is_empty() const noexcept { return m_bytes == 0; } -__device__ inline cudf::string_view::const_iterator udf_string::begin() const +__device__ inline cudf::string_view::const_iterator udf_string::begin() const noexcept { return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), 0); } -__device__ inline cudf::string_view::const_iterator udf_string::end() const +__device__ inline cudf::string_view::const_iterator udf_string::end() const noexcept { return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), length()); } @@ -216,7 +216,7 @@ __device__ inline cudf::size_type udf_string::byte_offset(cudf::size_type pos) c return offset; } -__device__ inline int udf_string::compare(cudf::string_view const in) const +__device__ inline int udf_string::compare(cudf::string_view const in) const noexcept { return compare(in.data(), in.size_bytes()); } @@ -227,39 +227,39 @@ __device__ inline int udf_string::compare(char const* data, cudf::size_type byte return view.compare(data, bytes); } -__device__ inline bool udf_string::operator==(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator==(cudf::string_view const rhs) const noexcept { return m_bytes == rhs.size_bytes() && compare(rhs) == 0; } -__device__ inline bool udf_string::operator!=(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator!=(cudf::string_view const rhs) const noexcept { return compare(rhs) != 0; } -__device__ inline bool udf_string::operator<(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator<(cudf::string_view const rhs) const noexcept { return compare(rhs) < 0; } -__device__ inline bool udf_string::operator>(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator>(cudf::string_view const rhs) const noexcept { return compare(rhs) > 0; } -__device__ inline bool udf_string::operator<=(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator<=(cudf::string_view const rhs) const noexcept { int rc = compare(rhs); return (rc == 0) || (rc < 0); } -__device__ inline bool udf_string::operator>=(cudf::string_view const rhs) const +__device__ inline bool udf_string::operator>=(cudf::string_view const rhs) const noexcept { int rc = compare(rhs); return (rc == 0) || (rc > 0); } -__device__ inline void udf_string::clear() +__device__ inline void udf_string::clear() noexcept { deallocate(m_data); m_data = nullptr; @@ -284,7 +284,7 @@ __device__ void udf_string::reserve(cudf::size_type count) if (count < max_size() && count > m_capacity) { reallocate(count); } } -__device__ cudf::size_type udf_string::capacity() const { return m_capacity; } +__device__ cudf::size_type udf_string::capacity() const noexcept { return m_capacity; } __device__ void udf_string::shrink_to_fit() { diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp index c57ab473017..2400b5ce71c 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -35,7 +35,7 @@ namespace udf { * This class manages a device buffer of UTF-8 encoded characters * for string manipulation in a device kernel. * - * It's methods and behavior are modelled after std::string but + * Its methods and behavior are modelled after std::string but * with special consideration for UTF-8 encoded strings and for * use within a cuDF UDF. */ @@ -103,47 +103,47 @@ class udf_string { __device__ udf_string(udf_string const& src); /** - * @brief Create a string object from a move reference + * @brief Move a string object from an rvalue reference * * The string data is moved from `src` into the instance returned. * The `src` will have no content. * * @param src String to copy */ - __device__ udf_string(udf_string&& src); + __device__ udf_string(udf_string&& src) noexcept; __device__ ~udf_string(); __device__ udf_string& operator=(udf_string const&); - __device__ udf_string& operator=(udf_string&&); + __device__ udf_string& operator=(udf_string&&) noexcept; __device__ udf_string& operator=(cudf::string_view const); __device__ udf_string& operator=(char const*); /** * @brief Return the number of bytes in this string */ - __device__ cudf::size_type size_bytes() const; + __device__ cudf::size_type size_bytes() const noexcept; /** * @brief Return the number of characters in this string */ - __device__ cudf::size_type length() const; + __device__ cudf::size_type length() const noexcept; /** * @brief Return the maximum number of bytes a udf_string can hold */ - __device__ cudf::size_type max_size() const; + __device__ cudf::size_type max_size() const noexcept; /** * @brief Return the internal pointer to the character array for this object */ - __device__ char* data(); - __device__ char const* data() const; + __device__ char* data() noexcept; + __device__ char const* data() const noexcept; /** * @brief Returns true if there are no characters in this string */ - __device__ bool is_empty() const; + __device__ bool is_empty() const noexcept; /** * @brief Returns an iterator that can be used to navigate through @@ -151,8 +151,8 @@ class udf_string { * * This returns a `cudf::string_view::const_iterator` which is read-only. */ - __device__ cudf::string_view::const_iterator begin() const; - __device__ cudf::string_view::const_iterator end() const; + __device__ cudf::string_view::const_iterator begin() const noexcept; + __device__ cudf::string_view::const_iterator end() const noexcept; /** * @brief Returns the character at the specified position @@ -199,7 +199,7 @@ class udf_string { * not match is ordered after the corresponding character in `str`, * or all compared characters match but the `str` string is longer. */ - __device__ int compare(cudf::string_view const str) const; + __device__ int compare(cudf::string_view const str) const noexcept; /** * @brief Comparing target character array with this string @@ -219,39 +219,39 @@ class udf_string { /** * @brief Returns true if `rhs` matches this string exactly */ - __device__ bool operator==(cudf::string_view const rhs) const; + __device__ bool operator==(cudf::string_view const rhs) const noexcept; /** * @brief Returns true if `rhs` does not match this string */ - __device__ bool operator!=(cudf::string_view const rhs) const; + __device__ bool operator!=(cudf::string_view const rhs) const noexcept; /** * @brief Returns true if this string is ordered before `rhs` */ - __device__ bool operator<(cudf::string_view const rhs) const; + __device__ bool operator<(cudf::string_view const rhs) const noexcept; /** * @brief Returns true if `rhs` is ordered before this string */ - __device__ bool operator>(cudf::string_view const rhs) const; + __device__ bool operator>(cudf::string_view const rhs) const noexcept; /** * @brief Returns true if this string matches or is ordered before `rhs` */ - __device__ bool operator<=(cudf::string_view const rhs) const; + __device__ bool operator<=(cudf::string_view const rhs) const noexcept; /** * @brief Returns true if `rhs` matches or is ordered before this string */ - __device__ bool operator>=(cudf::string_view const rhs) const; + __device__ bool operator>=(cudf::string_view const rhs) const noexcept; /** * @brief Remove all bytes from this string * * All pointers, references, and iterators are invalidated. */ - __device__ void clear(); + __device__ void clear() noexcept; /** * @brief Resizes string to contain `count` bytes @@ -279,7 +279,7 @@ class udf_string { /** * @brief Returns the number of bytes that the string has allocated */ - __device__ cudf::size_type capacity() const; + __device__ cudf::size_type capacity() const noexcept; /** * @brief Reduces internal allocation to just `size_bytes()` @@ -291,16 +291,18 @@ class udf_string { /** * @brief Moves the contents of `str` into this string instance * + * On return, the `str` will have no contents. + * * @param str String to move - * @return This string new contents + * @return This string with new contents */ - __device__ udf_string& assign(udf_string&& str); + __device__ udf_string& assign(udf_string&& str) noexcept; /** * @brief Replaces the contents of this string with contents of `str` * * @param str String to copy - * @return This string new contents + * @return This string with new contents */ __device__ udf_string& assign(cudf::string_view const str); @@ -308,7 +310,7 @@ class udf_string { * @brief Replaces the contents of this string with contents of `str` * * @param str Null-terminated UTF-8 character array - * @return This string new contents + * @return This string with new contents */ __device__ udf_string& assign(char const* str); @@ -317,7 +319,7 @@ class udf_string { * * @param str UTF-8 character array * @param bytes Number of bytes to copy from `str` - * @return This string new contents + * @return This string with new contents */ __device__ udf_string& assign(char const* str, cudf::size_type bytes); From 28e917b5482088a8b8bd9d551dc55c516672c92d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Oct 2022 10:20:12 -0400 Subject: [PATCH 14/54] fix return types for split --- .../cpp/include/cudf/strings/udf/split.cuh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh index 58e3e6fcfb4..bd2f1e9d3d9 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -77,10 +77,10 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, * Pass `nullptr` to just get the token count. * @return Number of tokens returned */ -__device__ inline int split(cudf::string_view const d_str, - char const* tgt, - cudf::size_type bytes, - cudf::string_view* result) +__device__ inline cudf::size_type split(cudf::string_view const d_str, + char const* tgt, + cudf::size_type bytes, + cudf::string_view* result) { return split(d_str, cudf::string_view{tgt, bytes}, result); } @@ -94,9 +94,9 @@ __device__ inline int split(cudf::string_view const d_str, * Pass `nullptr` to just get the token count. * @return Number of tokens returned */ -__device__ inline int split(cudf::string_view const d_str, - char const* tgt, - cudf::string_view* result) +__device__ inline cudf::size_type split(cudf::string_view const d_str, + char const* tgt, + cudf::string_view* result) { return split(d_str, tgt, detail::bytes_in_null_terminated_string(tgt), result); } From f82c454534eb44ef11c7c8266cf02d5f1db842b7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Oct 2022 10:20:29 -0400 Subject: [PATCH 15/54] fix doxygen for various functions --- python/strings_udf/cpp/include/cudf/strings/udf/case.cuh | 1 + python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh | 4 ++-- python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh index b11297cb721..472101959a6 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/case.cuh @@ -103,6 +103,7 @@ __device__ inline udf_string convert_case( * @tparam CapitalizeNextFn returns true if the next candidate character should be capitalized * @param tables The char tables required for conversion * @param d_str Input string to convert + * @param next_fn Function for next character capitalized * @return New string containing the converted characters */ template diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh index 38dbee72a84..0f0506898c2 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh @@ -53,11 +53,11 @@ __device__ udf_string pad(cudf::string_view const d_str, * * If the `width` is smaller than the length of `d_str` no change occurs. * - * If d_str starts with a sign character ('-' or '+') then '0' padding + * If `d_str` starts with a sign character ('-' or '+') then '0' padding * starts after the sign. * * @param d_str String to fill - * @param width Minimum length in characters of the output string + * @param width Minimum length in characters of the output string (including the sign character) */ __device__ udf_string zfill(cudf::string_view const d_str, cudf::size_type width) { diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh index 6c6639835cd..9c1c00a44fb 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh @@ -28,7 +28,7 @@ namespace udf { /** * @brief Strip characters from the beginning and/or end of the given string * - * The `d_to_strip` is interpretted as an array of characters to be removed. + * The `d_to_strip` is interpreted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. * * @code{.cpp} @@ -58,7 +58,7 @@ __device__ udf_string strip(cudf::string_view const d_str, /** * @brief Strip characters from the beginning of the given string * - * The `d_to_strip` is interpretted as an array of characters to be removed. + * The `d_to_strip` is interpreted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. * * @code{.cpp} @@ -83,7 +83,7 @@ __device__ udf_string lstrip(cudf::string_view const d_str, cudf::string_view d_ /** * @brief Strip characters from the end of the given string * - * The `d_to_strip` is interpretted as an array of characters to be removed. + * The `d_to_strip` is interpreted as an array of characters to be removed. * If `d_to_strip` is an empty string, whitespace characters are stripped. * * @code{.cpp} From 7b9718c25661b6f4fe9157eae6c81524093c9022 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 11:42:25 -0400 Subject: [PATCH 16/54] create free_udf_strings_array function --- .../cpp/include/cudf/strings/udf/udf_apis.hpp | 11 +++++ .../cpp/src/strings/udf/udf_apis.cu | 43 +++++++++---------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp index 9eb1c72dd5b..68834afa082 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp @@ -57,6 +57,17 @@ std::unique_ptr to_string_view_array(cudf::column_view const std::unique_ptr column_from_udf_string_array(udf_string* d_strings, cudf::size_type size); +/** + * @brief Frees a vector of udf_string objects + * + * The individual udf_strings are cleared freeing each of their internal + * device memory buffers. + * + * @param d_strings Pointer to device memory of udf_string objects + * @param size The number of elements in the d_strings array + */ +void free_udf_string_array(udf_string* d_strings, cudf::size_type size); + } // namespace udf } // namespace strings } // namespace cudf diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index ca258dce370..e96f691598d 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -33,21 +33,6 @@ namespace udf { namespace detail { namespace { -/** - * @brief Frees udf_strings device memory - * - * @param d_buffer Array of udf_strings - */ -void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, - cudf::size_type size, - rmm::cuda_stream_view stream) -{ - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - size, - [d_strings] __device__(auto idx) { d_strings[idx].clear(); }); -} - /** * @brief Functor wraps string_view objects around udf_string objects * @@ -78,7 +63,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const /** * @copydoc column_from_udf_string_array - + * * @param stream CUDA stream used for allocating/copying device memory and launching kernels */ std::unique_ptr column_from_udf_string_array(udf_string* d_strings, @@ -93,13 +78,22 @@ std::unique_ptr column_from_udf_string_array(udf_string* d_strings indices.data(), udf_string_to_string_view_transform_fn{}); - auto results = cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); - - // free the individual udf_string elements - free_udf_string_array(d_strings, size, stream); + return cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); +} - // return new column - return results; +/** + * @copydoc free_udf_string_array + * + * @param stream CUDA stream used for allocating/copying device memory and launching kernels + */ +void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + size, + [d_strings] __device__(auto idx) { d_strings[idx].clear(); }); } } // namespace detail @@ -117,6 +111,11 @@ std::unique_ptr column_from_udf_string_array(udf_string* d_strings return detail::column_from_udf_string_array(d_strings, size, rmm::cuda_stream_default); } +void free_udf_string_array(udf_string* d_strings, cudf::size_type size) +{ + detail::free_udf_string_array(d_strings, size, rmm::cuda_stream_default); +} + } // namespace udf } // namespace strings } // namespace cudf From 68e54e8ff91388e51dc91ef85f1b6178f1b00d97 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 11:44:18 -0400 Subject: [PATCH 17/54] fix compare returns, null assignment, reuse ctors --- .../include/cudf/strings/udf/udf_string.cuh | 62 ++++++++++++------- .../include/cudf/strings/udf/udf_string.hpp | 6 +- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index aa39b94d1c5..5590fa67c70 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -30,6 +30,12 @@ namespace strings { namespace udf { namespace detail { +/** + * @brief Count the bytes in a null-terminated character array + * + * @param str Null-terminated string + * @return Number of bytes in `str` upto but not including the null-terminator + */ __device__ inline static cudf::size_type bytes_in_null_terminated_string(char const* str) { if (!str) return 0; @@ -41,18 +47,38 @@ __device__ inline static cudf::size_type bytes_in_null_terminated_string(char co } // namespace detail +/** + * @brief Allocate memory for strings operation + * + * @param bytes Number of bytes in to allocate + * @return Pointer to allocated memory + */ __device__ inline char* udf_string::allocate(cudf::size_type bytes) { char* data = static_cast(malloc(bytes + 1)); - data[bytes] = 0; // add null-terminator so we can printf strings in device code + data[bytes] = '\0'; // add null-terminator so we can printf strings in device code return data; } +/** + * @brief Free memory created by allocate() + * + * @param data Pointer to allocated memory + */ __device__ inline void udf_string::deallocate(char* data) { if (data) free(data); } +/** + * @brief Allocate memory for strings operation + * + * Reallocates memory for `m_data` with new size `bytes` + * The original data in `m_data` is preserved up to `min(bytes,m_bytes)` + * + * @param bytes Number of bytes in to allocate + * @return Pointer to allocated memory + */ __device__ void udf_string::reallocate(cudf::size_type bytes) { m_capacity = bytes; @@ -75,23 +101,19 @@ __device__ udf_string::udf_string(cudf::size_type count, cudf::char_utf8 chr) m_bytes = m_capacity = cudf::strings::detail::bytes_in_char_utf8(chr) * count; m_data = allocate(m_capacity); auto out_ptr = m_data; - for (auto idx = 0; idx < count; ++idx) { + for (cudf::size_type idx = 0; idx < count; ++idx) { out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); } } __device__ inline udf_string::udf_string(char const* data) + : udf_string(data, detail::bytes_in_null_terminated_string(data)) { - m_bytes = m_capacity = detail::bytes_in_null_terminated_string(data); - m_data = allocate(m_capacity); - memcpy(m_data, data, m_bytes); } __device__ inline udf_string::udf_string(udf_string const& src) - : m_bytes(src.m_bytes), m_capacity(src.m_bytes) + : udf_string(src.m_data, src.m_bytes) { - m_data = allocate(m_capacity); - memcpy(m_data, src.m_data, m_bytes); } __device__ inline udf_string::udf_string(udf_string&& src) noexcept @@ -103,10 +125,8 @@ __device__ inline udf_string::udf_string(udf_string&& src) noexcept } __device__ inline udf_string::udf_string(cudf::string_view const str) - : m_bytes(str.size_bytes()), m_capacity(str.size_bytes()) + : udf_string(str.data(), str.size_bytes()) { - m_data = allocate(m_capacity); - memcpy(m_data, str.data(), m_bytes); } __device__ inline udf_string::~udf_string() { deallocate(m_data); } @@ -156,7 +176,7 @@ __device__ udf_string& udf_string::assign(char const* str, cudf::size_type bytes } m_bytes = bytes; memcpy(m_data, str, bytes); - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; return *this; } @@ -167,7 +187,7 @@ __device__ inline cudf::size_type udf_string::length() const noexcept return cudf::strings::detail::characters_in_string(m_data, m_bytes); } -__device__ cudf::size_type udf_string::max_size() const noexcept +__device__ constexpr cudf::size_type udf_string::max_size() const noexcept { return std::numeric_limits::max() - 1; } @@ -249,14 +269,12 @@ __device__ inline bool udf_string::operator>(cudf::string_view const rhs) const __device__ inline bool udf_string::operator<=(cudf::string_view const rhs) const noexcept { - int rc = compare(rhs); - return (rc == 0) || (rc < 0); + return compare(rhs) <= 0; } __device__ inline bool udf_string::operator>=(cudf::string_view const rhs) const noexcept { - int rc = compare(rhs); - return (rc == 0) || (rc > 0); + return compare(rhs) >= 0; } __device__ inline void udf_string::clear() noexcept @@ -276,7 +294,7 @@ __device__ inline void udf_string::resize(cudf::size_type count) if (count > m_bytes) { memset(m_data + m_bytes, 0, count - m_bytes); } m_bytes = count; - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; } __device__ void udf_string::reserve(cudf::size_type count) @@ -298,7 +316,7 @@ __device__ inline udf_string& udf_string::append(char const* str, cudf::size_typ if (nbytes > m_capacity) { reallocate(2 * nbytes); } memcpy(m_data + m_bytes, str, in_bytes); m_bytes = nbytes; - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; return *this; } @@ -318,7 +336,7 @@ __device__ inline udf_string& udf_string::append(cudf::char_utf8 chr, cudf::size out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); } m_bytes = nbytes; - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; return *this; } @@ -412,7 +430,7 @@ __device__ inline udf_string& udf_string::replace(cudf::size_type pos, memcpy(m_data + spos, str, in_bytes); m_bytes = nbytes; - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; return *this; } @@ -456,7 +474,7 @@ __device__ inline udf_string& udf_string::replace(cudf::size_type pos, } m_bytes = nbytes; - m_data[m_bytes] = 0; + m_data[m_bytes] = '\0'; return *this; } diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp index 2400b5ce71c..1dee976b299 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -132,7 +132,7 @@ class udf_string { /** * @brief Return the maximum number of bytes a udf_string can hold */ - __device__ cudf::size_type max_size() const noexcept; + __device__ constexpr cudf::size_type max_size() const noexcept; /** * @brief Return the internal pointer to the character array for this object @@ -182,6 +182,8 @@ class udf_string { * `data() + byte_offset(pos)` points to the memory location * the character at position `pos`. * + * The behavior is undefined if `pos < 0 or pos >= length()` + * * @param pos Index position of character to return byte offset. * @return Byte offset for character at `pos` */ @@ -261,6 +263,8 @@ class udf_string { * * All pointers, references, and iterators may be invalidated. * + * The behavior is undefined if `count > max_size()` + * * @param count Size in bytes of this string. */ __device__ void resize(cudf::size_type count); From 6eef0a4d8805f88491b97a3d7f15c4986c98056c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 11:44:56 -0400 Subject: [PATCH 18/54] fix some doxygen wording --- python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh | 2 +- python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh | 2 +- python/strings_udf/cpp/include/cudf/strings/udf/split.cuh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh index 4f9e8796fb5..c8c9f6e46f4 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh @@ -59,7 +59,7 @@ __device__ inline udf_string to_string(int64_t value) /** * @brief Converts a string into a double * - * Support scientific notation as well. + * This function supports scientific notation. * Overflow goes to inf or -inf and underflow may go to 0. */ __device__ inline double stod(string_view const& d_str) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh index 0f0506898c2..c43c67c9ddf 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh @@ -28,7 +28,7 @@ namespace udf { * @brief Pad beginning and/or end of a string with the given fill character * * The side_type::BOTH will attempt to center the text using the `fill_char`. - * If the `width` is smaller than the length of `d_str` no change occurs. + * If the `width <= d_str.length()` no change occurs. * * @tparam side Specify where the padding should occur * @param d_str String to pad diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh index bd2f1e9d3d9..6a11e5bf655 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -34,11 +34,11 @@ namespace udf { * * @code{.cpp} * auto d_str = cudf::string_view{"the best of times ", 19}; - * auto tgt = cudf::string_view{}; // empty string + * auto tgt = cudf::string_view{" ", 1}; * auto token_count = split(d_str, tgt, nullptr); * auto result = new cudf::string_view[token_count]; * split(d_str, tgt, result); - * // result is array like ["the", "best", "of", "times"] + * // result is array like ["the", "best", "", "of", "times", ""] * @endcode * * @param d_str String to split From 69e0d7c0b77dc87e6f6f1472d4a1c296307ab8bd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 17:36:27 -0400 Subject: [PATCH 19/54] remove string_view const parameter decl --- .../include/cudf/strings/udf/udf_string.cuh | 34 ++++++++----------- .../include/cudf/strings/udf/udf_string.hpp | 28 +++++++-------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index 5590fa67c70..92b5536364e 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -124,7 +124,7 @@ __device__ inline udf_string::udf_string(udf_string&& src) noexcept src.m_capacity = 0; } -__device__ inline udf_string::udf_string(cudf::string_view const str) +__device__ inline udf_string::udf_string(cudf::string_view str) : udf_string(str.data(), str.size_bytes()) { } @@ -138,10 +138,7 @@ __device__ inline udf_string& udf_string::operator=(udf_string&& str) noexcept return assign(std::move(str)); } -__device__ inline udf_string& udf_string::operator=(cudf::string_view const str) -{ - return assign(str); -} +__device__ inline udf_string& udf_string::operator=(cudf::string_view str) { return assign(str); } __device__ inline udf_string& udf_string::operator=(char const* str) { return assign(str); } @@ -157,7 +154,7 @@ __device__ udf_string& udf_string::assign(udf_string&& str) noexcept return *this; } -__device__ udf_string& udf_string::assign(cudf::string_view const str) +__device__ udf_string& udf_string::assign(cudf::string_view str) { return assign(str.data(), str.size_bytes()); } @@ -236,7 +233,7 @@ __device__ inline cudf::size_type udf_string::byte_offset(cudf::size_type pos) c return offset; } -__device__ inline int udf_string::compare(cudf::string_view const in) const noexcept +__device__ inline int udf_string::compare(cudf::string_view in) const noexcept { return compare(in.data(), in.size_bytes()); } @@ -247,32 +244,32 @@ __device__ inline int udf_string::compare(char const* data, cudf::size_type byte return view.compare(data, bytes); } -__device__ inline bool udf_string::operator==(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator==(cudf::string_view rhs) const noexcept { return m_bytes == rhs.size_bytes() && compare(rhs) == 0; } -__device__ inline bool udf_string::operator!=(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator!=(cudf::string_view rhs) const noexcept { return compare(rhs) != 0; } -__device__ inline bool udf_string::operator<(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator<(cudf::string_view rhs) const noexcept { return compare(rhs) < 0; } -__device__ inline bool udf_string::operator>(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator>(cudf::string_view rhs) const noexcept { return compare(rhs) > 0; } -__device__ inline bool udf_string::operator<=(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator<=(cudf::string_view rhs) const noexcept { return compare(rhs) <= 0; } -__device__ inline bool udf_string::operator>=(cudf::string_view const rhs) const noexcept +__device__ inline bool udf_string::operator>=(cudf::string_view rhs) const noexcept { return compare(rhs) >= 0; } @@ -340,15 +337,12 @@ __device__ inline udf_string& udf_string::append(cudf::char_utf8 chr, cudf::size return *this; } -__device__ inline udf_string& udf_string::append(cudf::string_view const in) +__device__ inline udf_string& udf_string::append(cudf::string_view in) { return append(in.data(), in.size_bytes()); } -__device__ inline udf_string& udf_string::operator+=(cudf::string_view const in) -{ - return append(in); -} +__device__ inline udf_string& udf_string::operator+=(cudf::string_view in) { return append(in); } __device__ inline udf_string& udf_string::operator+=(cudf::char_utf8 chr) { return append(chr); } @@ -366,7 +360,7 @@ __device__ inline udf_string& udf_string::insert(cudf::size_type pos, char const return insert(pos, str, detail::bytes_in_null_terminated_string(str)); } -__device__ inline udf_string& udf_string::insert(cudf::size_type pos, cudf::string_view const in) +__device__ inline udf_string& udf_string::insert(cudf::size_type pos, cudf::string_view in) { return insert(pos, in.data(), in.size_bytes()); } @@ -443,7 +437,7 @@ __device__ inline udf_string& udf_string::replace(cudf::size_type pos, __device__ inline udf_string& udf_string::replace(cudf::size_type pos, cudf::size_type count, - cudf::string_view const in) + cudf::string_view in) { return replace(pos, count, in.data(), in.size_bytes()); } diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp index 1dee976b299..728bb0b5adf 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -83,7 +83,7 @@ class udf_string { * * @param str String to copy */ - __device__ udf_string(cudf::string_view const str); + __device__ udf_string(cudf::string_view str); /** * @brief Create a string object with `count` copies of character `chr` @@ -201,7 +201,7 @@ class udf_string { * not match is ordered after the corresponding character in `str`, * or all compared characters match but the `str` string is longer. */ - __device__ int compare(cudf::string_view const str) const noexcept; + __device__ int compare(cudf::string_view str) const noexcept; /** * @brief Comparing target character array with this string @@ -221,32 +221,32 @@ class udf_string { /** * @brief Returns true if `rhs` matches this string exactly */ - __device__ bool operator==(cudf::string_view const rhs) const noexcept; + __device__ bool operator==(cudf::string_view rhs) const noexcept; /** * @brief Returns true if `rhs` does not match this string */ - __device__ bool operator!=(cudf::string_view const rhs) const noexcept; + __device__ bool operator!=(cudf::string_view rhs) const noexcept; /** * @brief Returns true if this string is ordered before `rhs` */ - __device__ bool operator<(cudf::string_view const rhs) const noexcept; + __device__ bool operator<(cudf::string_view rhs) const noexcept; /** * @brief Returns true if `rhs` is ordered before this string */ - __device__ bool operator>(cudf::string_view const rhs) const noexcept; + __device__ bool operator>(cudf::string_view rhs) const noexcept; /** * @brief Returns true if this string matches or is ordered before `rhs` */ - __device__ bool operator<=(cudf::string_view const rhs) const noexcept; + __device__ bool operator<=(cudf::string_view rhs) const noexcept; /** * @brief Returns true if `rhs` matches or is ordered before this string */ - __device__ bool operator>=(cudf::string_view const rhs) const noexcept; + __device__ bool operator>=(cudf::string_view rhs) const noexcept; /** * @brief Remove all bytes from this string @@ -308,7 +308,7 @@ class udf_string { * @param str String to copy * @return This string with new contents */ - __device__ udf_string& assign(cudf::string_view const str); + __device__ udf_string& assign(cudf::string_view str); /** * @brief Replaces the contents of this string with contents of `str` @@ -333,7 +333,7 @@ class udf_string { * @param str String to append * @return This string with the appended argument */ - __device__ udf_string& operator+=(cudf::string_view const str); + __device__ udf_string& operator+=(cudf::string_view str); /** * @brief Append a character to the end of this string @@ -375,7 +375,7 @@ class udf_string { * @param str String to append * @return This string with the appended argument */ - __device__ udf_string& append(cudf::string_view const str); + __device__ udf_string& append(cudf::string_view str); /** * @brief Append a character to the end of this string @@ -396,7 +396,7 @@ class udf_string { * @param str String to insert into this one * @return This string with the inserted argument */ - __device__ udf_string& insert(cudf::size_type pos, cudf::string_view const str); + __device__ udf_string& insert(cudf::size_type pos, cudf::string_view str); /** * @brief Insert a null-terminated character array into the character position specified @@ -457,9 +457,7 @@ class udf_string { * @param str String to replace the given range * @return This string modified with the replacement */ - __device__ udf_string& replace(cudf::size_type pos, - cudf::size_type count, - cudf::string_view const str); + __device__ udf_string& replace(cudf::size_type pos, cudf::size_type count, cudf::string_view str); /** * @brief Replace a range of characters with a null-terminated character array From a95c03070af37c5e81c16970bfb0946f0357152f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 18:46:50 -0400 Subject: [PATCH 20/54] fix default-stream --- .../cpp/include/cudf/strings/udf/udf_string.cuh | 8 ++++---- python/strings_udf/cpp/src/strings/udf/udf_apis.cu | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index 92b5536364e..e5c689fb87e 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -306,12 +306,12 @@ __device__ void udf_string::shrink_to_fit() if (m_bytes < m_capacity) { reallocate(m_bytes); } } -__device__ inline udf_string& udf_string::append(char const* str, cudf::size_type in_bytes) +__device__ inline udf_string& udf_string::append(char const* str, cudf::size_type bytes) { - if (in_bytes <= 0) { return *this; } - auto const nbytes = m_bytes + in_bytes; + if (bytes <= 0) { return *this; } + auto const nbytes = m_bytes + bytes; if (nbytes > m_capacity) { reallocate(2 * nbytes); } - memcpy(m_data + m_bytes, str, in_bytes); + memcpy(m_data + m_bytes, str, bytes); m_bytes = nbytes; m_data[m_bytes] = '\0'; return *this; diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index e96f691598d..7927740fd49 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,12 +109,12 @@ std::unique_ptr to_string_view_array(cudf::column_view const std::unique_ptr column_from_udf_string_array(udf_string* d_strings, cudf::size_type size) { - return detail::column_from_udf_string_array(d_strings, size, rmm::cuda_stream_default); + return detail::column_from_udf_string_array(d_strings, size, cudf::get_default_stream()); } void free_udf_string_array(udf_string* d_strings, cudf::size_type size) { - detail::free_udf_string_array(d_strings, size, rmm::cuda_stream_default); + detail::free_udf_string_array(d_strings, size, cudf::get_default_stream()); } } // namespace udf From e0526e6afa4774c179569089de19ab469a9489fe Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 19:16:29 -0400 Subject: [PATCH 21/54] remove lstrip and rstrip --- .../cpp/include/cudf/strings/udf/strip.cuh | 54 +++++-------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh index 9c1c00a44fb..f2db3073460 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/strip.cuh @@ -41,68 +41,38 @@ namespace udf { * // result is "b" ('a' or ' ' removed from the ends) * @endcode * - * @param d_str String to strip characters from - * @param d_to_strip Characters to remove - * @param stype From where to strip the characters; - * Default `BOTH` indicates stripping characters from the - * beginning and the end of the input string `d_str` - * @return New string with characters removed - */ -__device__ udf_string strip(cudf::string_view const d_str, - cudf::string_view const d_to_strip, - side_type stype = side_type::BOTH) -{ - return udf_string{cudf::strings::detail::strip(d_str, d_to_strip, stype)}; -} - -/** - * @brief Strip characters from the beginning of the given string - * - * The `d_to_strip` is interpreted as an array of characters to be removed. - * If `d_to_strip` is an empty string, whitespace characters are stripped. - * * @code{.cpp} * auto d_str = cudf::string_view{" aba ", 5}; * auto d_to_strip = cudf::string_view{}; // empty string - * auto result = lstrip(d_str, d_to_strip); + * auto result = strip(d_str, d_to_strip, side_type::LEFT); * // result is "aba " - * d_to_strip = cudf::string_view{"a ", 2}; // space and 'a' - * result = lstrip(d_str, d_to_strip); + * d_to_strip = cudf::string_view{"a ", 2}; // 'a' and space + * result = strip(d_str, d_to_strip, side_type::LEFT); * // result is "ba " ('a' or ' ' removed from the beginning) * @endcode * - * @param d_str String to strip characters from - * @param d_to_strip Characters to remove - * @return New string with characters removed - */ -__device__ udf_string lstrip(cudf::string_view const d_str, cudf::string_view d_to_strip) -{ - return strip(d_str, d_to_strip, side_type::LEFT); -} - -/** - * @brief Strip characters from the end of the given string - * - * The `d_to_strip` is interpreted as an array of characters to be removed. - * If `d_to_strip` is an empty string, whitespace characters are stripped. - * * @code{.cpp} * auto d_str = cudf::string_view{" aba ", 5}; * auto d_to_strip = cudf::string_view{}; // empty string - * auto result = rstrip(d_str, d_to_strip); + * auto result = strip(d_str, d_to_strip, side_type::RIGHT); * // result is " aba" * d_to_strip = cudf::string_view{" a", 2}; // space and 'a' - * result = rstrip(d_str, d_to_strip); + * result = rstrip(d_str, d_to_strip, side_type::RIGHT); * // result is " ab" ('a' or ' ' removed from the end) * @endcode * * @param d_str String to strip characters from * @param d_to_strip Characters to remove + * @param stype From where to strip the characters; + * Default `BOTH` indicates stripping characters from the + * beginning and the end of the input string `d_str` * @return New string with characters removed */ -__device__ udf_string rstrip(cudf::string_view const d_str, cudf::string_view d_to_strip) +__device__ udf_string strip(cudf::string_view const d_str, + cudf::string_view const d_to_strip, + side_type stype = side_type::BOTH) { - return strip(d_str, d_to_strip, side_type::RIGHT); + return udf_string{cudf::strings::detail::strip(d_str, d_to_strip, stype)}; } } // namespace udf From bc903d6d293319945c4485acc91737066a7d1c98 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Oct 2022 19:21:14 -0400 Subject: [PATCH 22/54] reword split doxygen text for result=nullptr --- python/strings_udf/cpp/include/cudf/strings/udf/split.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh index 6a11e5bf655..7eeeb94271b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -29,8 +29,8 @@ namespace udf { * @brief Split string using given string * * The caller must allocate an array of cudf::string_view to be filled - * in by this function. Calling this with a `nullptr` for the `result` - * will return the number of elements. + * in by this function. This function can be called with a `result=nullptr` + * to compute the number of tokens. * * @code{.cpp} * auto d_str = cudf::string_view{"the best of times ", 19}; From eb6532e1e0d86ac92596297b08383668d0207035 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 08:21:02 -0400 Subject: [PATCH 23/54] add cuda_runtime.h to resolve device refs --- cpp/include/cudf/strings/string_view.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 03bf538b1b2..265adc60392 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -17,6 +17,8 @@ #include +#include + #include /** From a8fca127c81322a02b6c5ceb035f587940eba873 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 08:21:39 -0400 Subject: [PATCH 24/54] fix doxygen wording for pad() --- python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh index c43c67c9ddf..d6d4ed637e9 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh @@ -28,7 +28,7 @@ namespace udf { * @brief Pad beginning and/or end of a string with the given fill character * * The side_type::BOTH will attempt to center the text using the `fill_char`. - * If the `width <= d_str.length()` no change occurs. + * If `width <= d_str.length()` no change occurs and the input `d_str` is returned. * * @tparam side Specify where the padding should occur * @param d_str String to pad From a249d133d145cd1135d0baf3729166fbb2e22259 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 08:22:02 -0400 Subject: [PATCH 25/54] refactor split; add count_tokens function --- .../cpp/include/cudf/strings/udf/split.cuh | 104 ++++++++++++++++-- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh index 7eeeb94271b..ca31425aa62 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/split.cuh @@ -24,6 +24,7 @@ namespace cudf { namespace strings { namespace udf { +namespace detail { /** * @brief Split string using given string @@ -66,6 +67,54 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, return count; } +} // namespace detail + +/** + * @brief Count tokens in a string without performing the split + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{" ", 1}; + * auto token_count = count_tokens(d_str, tgt); + * // token_count is 6 + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @return Number of tokens returned + */ +__device__ inline cudf::size_type count_tokens(cudf::string_view const d_str, + cudf::string_view const tgt) +{ + return detail::split(d_str, tgt, nullptr); +} + +/** + * @brief Split string using given string + * + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto tgt = cudf::string_view{" ", 1}; + * auto token_count = count_tokens(d_str, tgt); + * auto result = new cudf::string_view[token_count]; + * split(d_str, tgt, result); + * // result is array like ["the", "best", "", "of", "times", ""] + * @endcode + * + * @param d_str String to split + * @param tgt String to split on + * @param result Empty array to populate with output objects. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, + cudf::string_view const tgt, + cudf::string_view* result) +{ + return detail::split(d_str, tgt, result); +} /** * @brief Split string using given target array @@ -73,8 +122,7 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, * @param d_str String to split * @param tgt Character array encoded in UTF-8 used for identifying split points * @param bytes Number of bytes to read from `tgt` - * @param result Empty array to populate with output objects. - * Pass `nullptr` to just get the token count. + * @param result Empty array to populate with output objects * @return Number of tokens returned */ __device__ inline cudf::size_type split(cudf::string_view const d_str, @@ -82,7 +130,7 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::size_type bytes, cudf::string_view* result) { - return split(d_str, cudf::string_view{tgt, bytes}, result); + return detail::split(d_str, cudf::string_view{tgt, bytes}, result); } /** @@ -90,8 +138,7 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, * * @param d_str String to split * @param tgt Null-terminated character array encoded in UTF-8 used for identifying split points - * @param result Empty array to populate with output objects. - * Pass `nullptr` to just get the token count. + * @param result Empty array to populate with output objects * @return Number of tokens returned */ __device__ inline cudf::size_type split(cudf::string_view const d_str, @@ -101,11 +148,21 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, return split(d_str, tgt, detail::bytes_in_null_terminated_string(tgt), result); } +namespace detail { /** * @brief Split string on whitespace * - * This will create tokens by splitting on one or more consecutive whitespace characters - * found in `d_str`. + * The caller must allocate an array of cudf::string_view to be filled + * in by this function. This function can be called with a `result=nullptr` + * to compute the number of tokens. + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto token_count = split(d_str, nullptr); + * auto result = new cudf::string_view[token_count]; + * split(d_str, result); + * // result is array like ["the", "best", "of", "times"] + * @endcode * * @param d_str String to split * @param result Empty array to populate with output objects. @@ -123,6 +180,39 @@ __device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::str } return count; } +} // namespace detail + +/** + * @brief Count tokens in a string without performing the split on whitespace + * + * @code{.cpp} + * auto d_str = cudf::string_view{"the best of times ", 19}; + * auto token_count = count_tokens(d_str); + * // token_count is 4 + * @endcode + * + * @param d_str String to split + * @return Number of tokens returned + */ +__device__ inline cudf::size_type count_tokens(cudf::string_view const d_str) +{ + return detail::split(d_str, nullptr); +} + +/** + * @brief Split string on whitespace + * + * This will create tokens by splitting on one or more consecutive whitespace characters + * found in `d_str`. + * + * @param d_str String to split + * @param result Empty array to populate with output objects. + * @return Number of tokens returned + */ +__device__ inline cudf::size_type split(cudf::string_view const d_str, cudf::string_view* result) +{ + return detail::split(d_str, result); +} /** * @brief Join an array of strings with a separator From 96b06f632d878a424fb5a5bf16b5832e2d1f62e0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 08:22:38 -0400 Subject: [PATCH 26/54] refactor append, replace for better reuse --- .../include/cudf/strings/udf/udf_string.cuh | 38 ++----------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index e5c689fb87e..2bd8ceb9bba 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -324,17 +324,8 @@ __device__ inline udf_string& udf_string::append(char const* str) __device__ inline udf_string& udf_string::append(cudf::char_utf8 chr, cudf::size_type count) { - if (count <= 0) { return *this; } - auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * count; - auto const nbytes = m_bytes + char_bytes; - if (nbytes > m_capacity) { reallocate(2 * nbytes); } - auto out_ptr = m_data + m_bytes; - for (auto idx = 0; idx < count; ++idx) { - out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); - } - m_bytes = nbytes; - m_data[m_bytes] = '\0'; - return *this; + auto d_str = udf_string(count, chr); + return append(d_str); } __device__ inline udf_string& udf_string::append(cudf::string_view in) @@ -447,29 +438,8 @@ __device__ inline udf_string& udf_string::replace(cudf::size_type pos, cudf::size_type chr_count, cudf::char_utf8 chr) { - if (pos < 0 || chr_count < 0) { return *this; } - auto const spos = byte_offset(pos); - if (spos > m_bytes) { return *this; } - auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); - - // compute input size - auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * chr_count; - // compute new output size - auto const nbytes = m_bytes + char_bytes - (epos - spos); - if (nbytes > m_capacity) { reallocate(2 * nbytes); } - - // move bytes -- make room for the new character(s) - shift_bytes(spos + char_bytes, epos, nbytes); - - // copy chr chr_count times - auto out_ptr = m_data + spos; - for (auto idx = 0; idx < chr_count; ++idx) { - out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); - } - - m_bytes = nbytes; - m_data[m_bytes] = '\0'; - return *this; + auto d_str = udf_string(chr_count, chr); + return replace(pos, count, d_str); } __device__ udf_string& udf_string::erase(cudf::size_type pos, cudf::size_type count) From 7849307c86aa5369a873e724808f372a634a6849 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 12:02:53 -0400 Subject: [PATCH 27/54] expand spos/epos var names --- .../include/cudf/strings/udf/udf_string.cuh | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh index 2bd8ceb9bba..5c9a02a9510 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.cuh @@ -222,10 +222,10 @@ __device__ inline cudf::size_type udf_string::byte_offset(cudf::size_type pos) c { cudf::size_type offset = 0; - auto sptr = m_data; - auto eptr = sptr + m_bytes; - while ((pos > 0) && (sptr < eptr)) { - auto const byte = static_cast(*sptr++); + auto start = m_data; + auto end = start + m_bytes; + while ((pos > 0) && (start < end)) { + auto const byte = static_cast(*start++); auto const char_bytes = cudf::strings::detail::bytes_in_utf8_byte(byte); if (char_bytes) { --pos; } offset += char_bytes; @@ -366,21 +366,21 @@ __device__ inline udf_string& udf_string::insert(cudf::size_type pos, __device__ inline udf_string udf_string::substr(cudf::size_type pos, cudf::size_type count) const { if (pos < 0) { return udf_string{"", 0}; } - auto const spos = byte_offset(pos); - if (spos >= m_bytes) { return udf_string{"", 0}; } - auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); - return udf_string{data() + spos, epos - spos}; + auto const start_pos = byte_offset(pos); + if (start_pos >= m_bytes) { return udf_string{"", 0}; } + auto const end_pos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + return udf_string{data() + start_pos, end_pos - start_pos}; } // utility for replace() -__device__ void udf_string::shift_bytes(cudf::size_type spos, - cudf::size_type epos, +__device__ void udf_string::shift_bytes(cudf::size_type start_pos, + cudf::size_type end_pos, cudf::size_type nbytes) { if (nbytes < m_bytes) { // shift bytes to the left [...wxyz] -> [wxyzxyz] - auto src = epos; - auto tgt = spos; + auto src = end_pos; + auto tgt = start_pos; while (tgt < nbytes) { m_data[tgt++] = m_data[src++]; } @@ -388,7 +388,7 @@ __device__ void udf_string::shift_bytes(cudf::size_type spos, // shift bytes to the right [abcd...] -> [abcabcd] auto src = m_bytes; auto tgt = nbytes; - while (src > epos) { + while (src > end_pos) { m_data[--tgt] = m_data[--src]; } } @@ -400,19 +400,19 @@ __device__ inline udf_string& udf_string::replace(cudf::size_type pos, cudf::size_type in_bytes) { if (pos < 0 || in_bytes < 0) { return *this; } - auto const spos = byte_offset(pos); - if (spos > m_bytes) { return *this; } - auto const epos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + auto const start_pos = byte_offset(pos); + if (start_pos > m_bytes) { return *this; } + auto const end_pos = count < 0 ? m_bytes : std::min(byte_offset(pos + count), m_bytes); // compute new size - auto const nbytes = m_bytes + in_bytes - (epos - spos); + auto const nbytes = m_bytes + in_bytes - (end_pos - start_pos); if (nbytes > m_capacity) { reallocate(2 * nbytes); } // move bytes -- make room for replacement - shift_bytes(spos + in_bytes, epos, nbytes); + shift_bytes(start_pos + in_bytes, end_pos, nbytes); // insert the replacement - memcpy(m_data + spos, str, in_bytes); + memcpy(m_data + start_pos, str, in_bytes); m_bytes = nbytes; m_data[m_bytes] = '\0'; @@ -447,9 +447,9 @@ __device__ udf_string& udf_string::erase(cudf::size_type pos, cudf::size_type co return replace(pos, count, nullptr, 0); } -__device__ inline cudf::size_type udf_string::char_offset(cudf::size_type bytepos) const +__device__ inline cudf::size_type udf_string::char_offset(cudf::size_type byte_pos) const { - return cudf::strings::detail::characters_in_string(data(), bytepos); + return cudf::strings::detail::characters_in_string(data(), byte_pos); } } // namespace udf From cadcf79e079b938dcc0be755c1308ed7a147f1c5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Nov 2022 12:03:31 -0400 Subject: [PATCH 28/54] add more doc to replace() for count parm --- .../include/cudf/strings/udf/udf_string.hpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp index 728bb0b5adf..2bbda357cee 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp +++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_string.hpp @@ -452,6 +452,9 @@ class udf_string { * Replaces characters in range `[pos, pos + count]` with `str`. * There is no effect if `pos < 0 or pos > length()`. * + * If `count==0` then `str` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * * @param pos Position of first character to replace * @param count Number of characters to replace * @param str String to replace the given range @@ -465,6 +468,9 @@ class udf_string { * Replaces characters in range `[pos, pos + count)` with `data`. * There is no effect if `pos < 0 or pos > length()`. * + * If `count==0` then `data` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * * @param pos Position of first character to replace * @param count Number of characters to replace * @param data Null-terminated character array to replace the given range @@ -478,6 +484,9 @@ class udf_string { * Replaces characters in range `[pos, pos + count)` with `[data, data + bytes)`. * There is no effect if `pos < 0 or pos > length()`. * + * If `count==0` then `data` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * * @param pos Position of first character to replace * @param count Number of characters to replace * @param data String to replace the given range @@ -495,6 +504,9 @@ class udf_string { * Replaces characters in range `[pos, pos + count)` with `chr` `chr_count` times. * There is no effect if `pos < 0 or pos > length()`. * + * If `count==0` then `chr` is inserted starting at `pos`. + * If `count==npos` then the replacement range is `[pos,length())`. + * * @param pos Position of first character to replace * @param count Number of characters to replace * @param chr_count Number of times `chr` will repeated @@ -527,8 +539,10 @@ class udf_string { __device__ char* allocate(cudf::size_type bytes); __device__ void deallocate(char* data); __device__ void reallocate(cudf::size_type bytes); - __device__ cudf::size_type char_offset(cudf::size_type bytepos) const; - __device__ void shift_bytes(cudf::size_type spos, cudf::size_type epos, cudf::size_type nbytes); + __device__ cudf::size_type char_offset(cudf::size_type byte_pos) const; + __device__ void shift_bytes(cudf::size_type start_pos, + cudf::size_type end_pos, + cudf::size_type nbytes); }; } // namespace udf From 1e02c262eacf68d0a25b96dbcf6c72d3cbe50b20 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 1 Nov 2022 12:16:22 -0700 Subject: [PATCH 29/54] adjust for changes --- .../strings_udf/_lib/cpp/strings_udf.pxd | 7 +++++++ .../strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 14 ++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd index ee145e4023b..648fab0abe7 100644 --- a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -11,12 +11,19 @@ from cudf._lib.cpp.types cimport size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ + "cudf::strings::udf" nogil: + cdef cppclass udf_string + cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + cdef unique_ptr[column] column_from_udf_string_array( void*, size_t ) except + + cdef void free_udf_string_array( + udf_string* d_strings, size_type size + ) except + cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ "cudf::strings::detail" nogil: diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index 19fbf51bd1c..c1c99225133 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -6,12 +6,14 @@ from libcpp.utility cimport move from cudf.core.buffer import Buffer from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column_view +from cudf._lib.cpp.column.column cimport column, column_view from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from strings_udf._lib.cpp.strings_udf cimport ( column_from_udf_string_array as cpp_column_from_udf_string_array, + free_udf_string_array as cpp_free_udf_string_array, to_string_view_array as cpp_to_string_view_array, + udf_string, ) @@ -26,12 +28,16 @@ def to_string_view_array(Column strings_col): def from_udf_string_array(DeviceBuffer d_buffer): - cdef size_t size = d_buffer.c_size() - cdef void* data = d_buffer.c_data() + cdef size_t size = d_buffer.c_size() // 16 + cdef udf_string* data = d_buffer.c_data() cdef unique_ptr[column] c_result # data = with nogil: c_result = move(cpp_column_from_udf_string_array(data, size)) - return Column.from_unique_ptr(move(c_result)) + result = Column.from_unique_ptr(move(c_result)) + with nogil: + cpp_free_udf_string_array(data, size) + + return result From 1218c08b96a63e98287ad36d3f45970dee2749e6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Nov 2022 08:06:37 -0700 Subject: [PATCH 30/54] fix up cython --- python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd | 2 +- python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd index 648fab0abe7..4835325dbd7 100644 --- a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -19,7 +19,7 @@ cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + cdef unique_ptr[column] column_from_udf_string_array( - void*, size_t + udf_string* d_strings, size_type size, ) except + cdef void free_udf_string_array( udf_string* d_strings, size_type size diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index c1c99225133..186d30984bb 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -28,10 +28,9 @@ def to_string_view_array(Column strings_col): def from_udf_string_array(DeviceBuffer d_buffer): - cdef size_t size = d_buffer.c_size() // 16 + cdef size_t size = int(d_buffer.c_size() / sizeof(udf_string)) cdef udf_string* data = d_buffer.c_data() cdef unique_ptr[column] c_result - # data = with nogil: c_result = move(cpp_column_from_udf_string_array(data, size)) From e864deae2e7933dc796cbf910a6caae1da11b028 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Nov 2022 08:42:50 -0700 Subject: [PATCH 31/54] from_udf_string_array -> column_from_udf_string_array, to_string_view_array->column_to_string_view_array --- python/cudf/cudf/core/udf/__init__.py | 8 ++++---- python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 4 ++-- python/strings_udf/strings_udf/tests/test_string_udfs.py | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 194868659a7..a5aa9bb89da 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -36,8 +36,8 @@ utils.ptx_files.append(ptxpath) from strings_udf._lib.cudf_jit_udf import ( - from_udf_string_array, - to_string_view_array, + column_from_udf_string_array, + column_to_string_view_array, ) from strings_udf._typing import ( str_view_arg_handler, @@ -53,8 +53,8 @@ ) _supported_masked_types |= {string_view} - utils.launch_arg_getters[cudf_str_dtype] = to_string_view_array - utils.output_col_getters[cudf_str_dtype] = from_udf_string_array + utils.launch_arg_getters[cudf_str_dtype] = column_to_string_view_array + utils.output_col_getters[cudf_str_dtype] = column_from_udf_string_array utils.masked_array_types[cudf_str_dtype] = string_view row_function.itemsizes[cudf_str_dtype] = string_view.size_bytes diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index 8b628dd5229..32d4d6cb073 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -17,7 +17,7 @@ from strings_udf._lib.cpp.strings_udf cimport ( ) -def to_string_view_array(Column strings_col): +def column_to_string_view_array(Column strings_col): cdef unique_ptr[device_buffer] c_buffer cdef column_view input_view = strings_col.view() with nogil: @@ -27,7 +27,7 @@ def to_string_view_array(Column strings_col): return as_buffer(device_buffer) -def from_udf_string_array(DeviceBuffer d_buffer): +def column_from_udf_string_array(DeviceBuffer d_buffer): cdef size_t size = int(d_buffer.c_size() / sizeof(udf_string)) cdef udf_string* data = d_buffer.c_data() cdef unique_ptr[column] c_result diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index b5d44229e00..f29f6345045 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -14,8 +14,8 @@ import strings_udf from strings_udf._lib.cudf_jit_udf import ( - from_udf_string_array, - to_string_view_array, + column_from_udf_string_array, + column_to_string_view_array, ) from strings_udf._typing import str_view_arg_handler, string_view, udf_string @@ -65,13 +65,13 @@ def run_udf_test(data, func, dtype): output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) cudf_column = cudf.core.column.as_column(data) - str_view_ary = to_string_view_array(cudf_column) + str_view_ary = column_to_string_view_array(cudf_column) kernel = get_kernel(func, dtype, len(data)) kernel.forall(len(data))(str_view_ary, output_ary) if dtype == "str": - output_ary = from_udf_string_array(output_ary) + output_ary = column_from_udf_string_array(output_ary) got = cudf.Series(output_ary, dtype=dtype) expect = pd.Series(data).apply(func) From 9fccc9b31ece3409d48ea49ca6ad7912a100ff4d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Nov 2022 12:25:23 -0700 Subject: [PATCH 32/54] refactor --- python/cudf/cudf/core/udf/__init__.py | 6 ++--- python/cudf/cudf/core/udf/utils.py | 28 ++++++++++++++--------- python/strings_udf/strings_udf/_typing.py | 15 ++++++++++++ 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index a5aa9bb89da..e5451fa7fa1 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -51,17 +51,15 @@ cuda_lower(api.Masked, string_view, types.boolean)( masked_lowering.masked_constructor ) - + utils.JIT_SUPPORTED_TYPES |= STRING_TYPES _supported_masked_types |= {string_view} + utils.launch_arg_getters[cudf_str_dtype] = column_to_string_view_array utils.output_col_getters[cudf_str_dtype] = column_from_udf_string_array utils.masked_array_types[cudf_str_dtype] = string_view row_function.itemsizes[cudf_str_dtype] = string_view.size_bytes - utils.JIT_SUPPORTED_TYPES |= STRING_TYPES - utils.arg_handlers.append(str_view_arg_handler) - utils.udf_return_type_map[string_view] = udf_string _STRING_UDFS_ENABLED = True except ImportError as e: diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index b4912882b56..f7ec0fd5ce0 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -4,7 +4,6 @@ import cachetools import cupy as cp -import numba import numpy as np from numba import cuda, typeof from numba.core.errors import TypingError @@ -34,7 +33,6 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) arg_handlers: List[Any] = [] ptx_files: List[Any] = [] -udf_return_type_map: Dict[Any, Any] = {} masked_array_types: Dict[Any, Any] = {} launch_arg_getters: Dict[Any, Any] = {} output_col_getters: Dict[Any, Any] = {} @@ -72,7 +70,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): if not isinstance(numba_output_type, MaskedType) else numba_output_type.value_type ) - result = udf_return_type_map.get(result, result) + result = result if result.is_internal else result.return_type # _get_udf_return_type will throw a TypingError if the user tries to use # a field in the row containing an unsupported dtype, except in the @@ -148,7 +146,7 @@ def _construct_signature(frame, return_type, args): actually JIT the kernel itself later, accounting for types and offsets. Skips columns with unsupported dtypes. """ - if return_type in udf_return_type_map.values(): + if not return_type.is_internal: return_type = CPointer(return_type) else: return_type = return_type[::1] @@ -222,11 +220,19 @@ def _compile_or_get(frame, func, args, kernel_getter=None): # could be a MaskedType or a scalar type. kernel, scalar_return_type = kernel_getter(frame, func, args) - try: - np_return_type = numpy_support.as_dtype(scalar_return_type) - except numba.core.errors.NumbaNotImplementedError: - # TODO: fix - np_return_type = np.dtype("object") + np_return_type = ( + numpy_support.as_dtype(scalar_return_type) + if scalar_return_type.is_internal + else scalar_return_type.np_dtype + ) + + # try: + # np_return_type = numpy_support.as_dtype(scalar_return_type) + # except numba.core.errors.NumbaNotImplementedError: + # # TODO: fix + # np_return_type = np.dtype("object") + # np_return_type = numpy_support.as_dtype(scalar_return_type) + precompiled[cache_key] = (kernel, np_return_type) return kernel, np_return_type @@ -264,9 +270,9 @@ def _get_input_args_from_frame(fr): def _return_arr_from_dtype(dt, size): - extensionty = udf_return_type_map.get(masked_array_types.get(dt)) + extensionty = masked_array_types.get(dt) if extensionty: - return rmm.DeviceBuffer(size=size * extensionty.size_bytes) + return rmm.DeviceBuffer(size=size * extensionty.return_type.size_bytes) return cp.empty(size, dtype=dt) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 63d2cdf69aa..320958960cd 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -3,6 +3,7 @@ import operator import llvmlite.binding as ll +import numpy as np from numba import types from numba.core.datamodel import default_manager from numba.core.extending import models, register_model @@ -24,18 +25,32 @@ # String object definitions class UDFString(types.Type): + + np_dtype = np.dtype("object") + def __init__(self): super().__init__(name="udf_string") llty = default_manager[self].get_value_type() self.size_bytes = llty.get_abi_size(target_data) + @property + def return_type(self): + return self + class StringView(types.Type): + + np_dtype = np.dtype("object") + def __init__(self): super().__init__(name="string_view") llty = default_manager[self].get_value_type() self.size_bytes = llty.get_abi_size(target_data) + @property + def return_type(self): + return UDFString() + @register_model(StringView) class stringview_model(models.StructModel): From d5c37a8c16477404e1d838fbc4764432503f5148 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Nov 2022 12:27:33 -0700 Subject: [PATCH 33/54] prune imports --- python/cudf/cudf/core/udf/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index e5451fa7fa1..926d2ea6cbf 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -1,15 +1,8 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -import cupy as cp -import numpy as np -from numba import cuda, types -from numba.cuda.cudaimpl import ( - lower as cuda_lower, - registry as cuda_lowering_registry, -) -import rmm +from numba import types +from numba.cuda.cudaimpl import lower as cuda_lower -from cudf.core.column import as_column from cudf.core.dtypes import dtype from cudf.core.udf import api, row_function, utils from cudf.utils.dtypes import STRING_TYPES @@ -39,11 +32,7 @@ column_from_udf_string_array, column_to_string_view_array, ) - from strings_udf._typing import ( - str_view_arg_handler, - string_view, - udf_string, - ) + from strings_udf._typing import str_view_arg_handler, string_view from . import strings_typing # isort: skip from . import strings_lowering # isort: skip From b7c1b1d0cf381010db8043272baf3fa861bdb3f1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Nov 2022 12:29:42 -0700 Subject: [PATCH 34/54] cleanup --- python/cudf/cudf/core/udf/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index f7ec0fd5ce0..750ab0bded0 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -226,13 +226,6 @@ def _compile_or_get(frame, func, args, kernel_getter=None): else scalar_return_type.np_dtype ) - # try: - # np_return_type = numpy_support.as_dtype(scalar_return_type) - # except numba.core.errors.NumbaNotImplementedError: - # # TODO: fix - # np_return_type = np.dtype("object") - # np_return_type = numpy_support.as_dtype(scalar_return_type) - precompiled[cache_key] = (kernel, np_return_type) return kernel, np_return_type From 267b90497570ab15afbc42ab58ac4fd0a5ab0645 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Nov 2022 07:03:26 -0700 Subject: [PATCH 35/54] begin to address reviews --- python/strings_udf/strings_udf/lowering.py | 8 +------- .../strings_udf/tests/test_string_udfs.py | 12 ++++++------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index f7e67129ebe..909b0e56187 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -18,17 +18,11 @@ character_flags_table_ptr = get_character_flags_table_ptr() - -# read-only functions -# We will provide only one overload for this set of functions, which will -# expect a string_view. When a literal is encountered, numba will promote it to -# a string_view whereas when a dstring is encountered, numba will convert it to -# a view via its native view() method. - _STR_VIEW_PTR = types.CPointer(string_view) # CUDA function declarations +# read-only (input is a string_view, output is a fixed with type) _string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR)) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index f29f6345045..ca3fbda4eb1 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -59,21 +59,21 @@ def run_udf_test(data, func, dtype): comparing it with the equivalent pandas result """ if dtype == "str": - output_ary = rmm.DeviceBuffer(size=len(data) * udf_string.size_bytes) + output = rmm.DeviceBuffer(size=len(data) * udf_string.size_bytes) else: dtype = np.dtype(dtype) - output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) + output = cudf.core.column.column_empty(len(data), dtype=dtype) cudf_column = cudf.core.column.as_column(data) - str_view_ary = column_to_string_view_array(cudf_column) + str_views = column_to_string_view_array(cudf_column) kernel = get_kernel(func, dtype, len(data)) - kernel.forall(len(data))(str_view_ary, output_ary) + kernel.forall(len(data))(str_views, output) if dtype == "str": - output_ary = column_from_udf_string_array(output_ary) + output = column_from_udf_string_array(output) - got = cudf.Series(output_ary, dtype=dtype) + got = cudf.Series(output, dtype=dtype) expect = pd.Series(data).apply(func) assert_eq(expect, got, check_dtype=False) From 8b7a412e469c444642cad3d4f990e386ce3bfa7d Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 4 Nov 2022 09:04:09 -0500 Subject: [PATCH 36/54] Update python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx Co-authored-by: Vyas Ramasubramani --- python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index 32d4d6cb073..4fc9e473fa3 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -34,9 +34,8 @@ def column_from_udf_string_array(DeviceBuffer d_buffer): with nogil: c_result = move(cpp_column_from_udf_string_array(data, size)) + cpp_free_udf_string_array(data, size) result = Column.from_unique_ptr(move(c_result)) - with nogil: - cpp_free_udf_string_array(data, size) return result From 4f821ca89e07af6ccdb7deedd8876634ead58910 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Nov 2022 07:25:06 -0700 Subject: [PATCH 37/54] finish addressing reviews, walrus everywhere! --- python/cudf/cudf/core/udf/utils.py | 6 ++---- python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 750ab0bded0..4d40d41f9c3 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -263,14 +263,12 @@ def _get_input_args_from_frame(fr): def _return_arr_from_dtype(dt, size): - extensionty = masked_array_types.get(dt) - if extensionty: + if extensionty := masked_array_types.get(dt): return rmm.DeviceBuffer(size=size * extensionty.return_type.size_bytes) return cp.empty(size, dtype=dt) def _post_process_output_col(col, retty): - getter = output_col_getters.get(retty) - if getter: + if getter := output_col_getters.get(retty): col = getter(col) return as_column(col, retty) diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd index 4835325dbd7..7b90760abcc 100644 --- a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -19,10 +19,10 @@ cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + cdef unique_ptr[column] column_from_udf_string_array( - udf_string* d_strings, size_type size, + udf_string* strings, size_type size, ) except + cdef void free_udf_string_array( - udf_string* d_strings, size_type size + udf_string* strings, size_type size ) except + cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ From b0a8681334d283b02ecfcdecd8fb6670e8ee8a53 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 7 Nov 2022 11:50:24 -0800 Subject: [PATCH 38/54] support strip --- .../strings_udf/cpp/src/strings/udf/shim.cu | 15 +++++++ python/strings_udf/strings_udf/_typing.py | 19 +++++++-- python/strings_udf/strings_udf/lowering.py | 40 ++++++++++++++++++- .../strings_udf/tests/test_string_udfs.py | 8 ++++ 4 files changed, 78 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index b284d58fe58..8c19a59d45c 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include using namespace cudf::strings::udf; @@ -227,3 +228,17 @@ extern "C" __device__ int udf_string_from_string_view(int* nb_retbal, return 0; } + +extern "C" __device__ int strip(int* nb_retval, + void* udf_str, + void* const* to_strip, + void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr); + + return 0; +} diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 320958960cd..e70ab2c8a63 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -181,7 +181,7 @@ def attr(self, mod): return attr -def create_identifier_attr(attrname): +def create_identifier_attr(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a unary function of a string @@ -192,7 +192,7 @@ class StringViewIdentifierAttr(AbstractTemplate): key = f"StringView.{attrname}" def generic(self, args, kws): - return nb_signature(types.boolean, recvr=self.this) + return nb_signature(retty, recvr=self.this) def attr(self, mod): return types.BoundFunction(StringViewIdentifierAttr, string_view) @@ -229,6 +229,7 @@ def resolve_count(self, mod): "isnumeric", "istitle", ] +string_binary_funcs = ["strip"] for func in bool_binary_funcs: setattr( @@ -237,12 +238,24 @@ def resolve_count(self, mod): create_binary_attr(func, types.boolean), ) +for func in string_binary_funcs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attr(func, udf_string), + ) + + for func in int_binary_funcs: setattr( StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) ) for func in id_unary_funcs: - setattr(StringViewAttrs, f"resolve_{func}", create_identifier_attr(func)) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attr(func, types.boolean), + ) cuda_decl_registry.register_attr(StringViewAttrs) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 909b0e56187..26c019e9b1b 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -19,6 +19,7 @@ character_flags_table_ptr = get_character_flags_table_ptr() _STR_VIEW_PTR = types.CPointer(string_view) +_UDF_STRING_PTR = types.CPointer(udf_string) # CUDA function declarations @@ -55,7 +56,9 @@ def _declare_binary_func(lhs, rhs, out, name): _string_view_find = _declare_size_type_str_str_func("find") _string_view_rfind = _declare_size_type_str_str_func("rfind") _string_view_contains = _declare_bool_str_str_func("contains") - +_string_view_strip = cuda.declare_device( + "strip", types.int32(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) +) # A binary function of the form f(string, int) -> bool _declare_bool_str_int_func = partial( @@ -179,6 +182,36 @@ def binary_func_impl(context, builder, sig, args): return deco +def create_binary_string_func_return_string(binary_func): + def deco(cuda_func): + @cuda_lower(binary_func, string_view, string_view) + def binary_func_return_string_impl(context, builder, sig, args): + lhs_ptr = builder.alloca(args[0].type) + rhs_ptr = builder.alloca(args[1].type) + builder.store(args[0], lhs_ptr) + builder.store(args[1], rhs_ptr) + + udf_str_ptr = builder.alloca( + default_manager[udf_string].get_value_type() + ) + + _ = context.compile_internal( + builder, + cuda_func, + size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + (udf_str_ptr, lhs_ptr, rhs_ptr), + ) + + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + return result._getvalue() + + return binary_func_return_string_impl + + return deco + + @create_binary_string_func(operator.contains, types.boolean) def contains_impl(st, substr): return _string_view_contains(st, substr) @@ -214,6 +247,11 @@ def lt_impl(st, rhs): return _string_view_lt(st, rhs) +@create_binary_string_func_return_string("StringView.strip") +def strip_impl(result, to_strip, strip_char): + return _string_view_strip(result, to_strip, strip_char) + + @create_binary_string_func("StringView.startswith", types.boolean) def startswith_impl(sv, substr): return _string_view_startswith(sv, substr) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index ca3fbda4eb1..025e98b3b86 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -278,3 +278,11 @@ def func(st): return st run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(data, strip_char): + def func(st): + return st.strip(strip_char) + + run_udf_test(data, func, "str") From 18aee5a574a8abd85d048d8bc3580aac9908146f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 06:46:50 -0800 Subject: [PATCH 39/54] updates --- .../strings_udf/cpp/src/strings/udf/shim.cu | 28 +++++++++++++++++++ python/strings_udf/strings_udf/_typing.py | 2 +- python/strings_udf/strings_udf/lowering.py | 16 +++++++++++ .../strings_udf/tests/test_string_udfs.py | 16 +++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 8c19a59d45c..63e740c5226 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -242,3 +242,31 @@ extern "C" __device__ int strip(int* nb_retval, return 0; } + +extern "C" __device__ int lstrip(int* nb_retval, + void* udf_str, + void* const* to_strip, + void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::LEFT); + + return 0; +} + +extern "C" __device__ int rstrip(int* nb_retval, + void* udf_str, + void* const* to_strip, + void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::RIGHT); + + return 0; +} diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index e70ab2c8a63..5da73413cef 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -229,7 +229,7 @@ def resolve_count(self, mod): "isnumeric", "istitle", ] -string_binary_funcs = ["strip"] +string_binary_funcs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: setattr( diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 26c019e9b1b..2b445094861 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -59,6 +59,12 @@ def _declare_binary_func(lhs, rhs, out, name): _string_view_strip = cuda.declare_device( "strip", types.int32(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) ) +_string_view_lstrip = cuda.declare_device( + "strip", types.int32(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) +) +_string_view_rstrip = cuda.declare_device( + "strip", types.int32(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) +) # A binary function of the form f(string, int) -> bool _declare_bool_str_int_func = partial( @@ -252,6 +258,16 @@ def strip_impl(result, to_strip, strip_char): return _string_view_strip(result, to_strip, strip_char) +@create_binary_string_func_return_string("StringView.lstrip") +def lstrip_impl(result, to_strip, strip_char): + return _string_view_lstrip(result, to_strip, strip_char) + + +@create_binary_string_func_return_string("StringView.rstrip") +def rstrip_impl(result, to_strip, strip_char): + return _string_view_rstrip(result, to_strip, strip_char) + + @create_binary_string_func("StringView.startswith", types.boolean) def startswith_impl(sv, substr): return _string_view_startswith(sv, substr) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 025e98b3b86..522433d404f 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -286,3 +286,19 @@ def func(st): return st.strip(strip_char) run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(data, strip_char): + def func(st): + return st.lstrip(strip_char) + + run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(data, strip_char): + def func(st): + return st.rstrip(strip_char) + + run_udf_test(data, func, "str") From d7556b078fa337a19b78e771201c07947b67d5d7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 07:03:05 -0800 Subject: [PATCH 40/54] fix bad merge --- python/strings_udf/cpp/src/strings/udf/shim.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index ed254b8e4e1..63e740c5226 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -17,10 +17,7 @@ #include #include #include -<<<<<<< HEAD #include -======= ->>>>>>> branch-22.12 #include using namespace cudf::strings::udf; From c4f884716b379f2e4c68a7159c81de06944cb59f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 07:04:14 -0800 Subject: [PATCH 41/54] add tests to cudf --- python/cudf/cudf/tests/test_udf_masked_ops.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index b4c7cef3a4c..2d9951982f9 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -876,6 +876,30 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(str_udf_data, strip_char): + def func(row): + return row["str_col"].strip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].lstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].rstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] ) From 7030108082cc2c7a54d1c722dab6a04c996280cb Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 07:31:05 -0800 Subject: [PATCH 42/54] plumb to maskedtype --- python/cudf/cudf/core/udf/__init__.py | 11 +++++++++-- python/cudf/cudf/core/udf/masked_typing.py | 1 + python/cudf/cudf/core/udf/strings_lowering.py | 12 +++++++++++- python/cudf/cudf/core/udf/strings_typing.py | 9 +++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 926d2ea6cbf..8421d763167 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -32,7 +32,11 @@ column_from_udf_string_array, column_to_string_view_array, ) - from strings_udf._typing import str_view_arg_handler, string_view + from strings_udf._typing import ( + str_view_arg_handler, + string_view, + udf_string, + ) from . import strings_typing # isort: skip from . import strings_lowering # isort: skip @@ -41,7 +45,7 @@ masked_lowering.masked_constructor ) utils.JIT_SUPPORTED_TYPES |= STRING_TYPES - _supported_masked_types |= {string_view} + _supported_masked_types |= {string_view, udf_string} utils.launch_arg_getters[cudf_str_dtype] = column_to_string_view_array utils.output_col_getters[cudf_str_dtype] = column_from_udf_string_array @@ -49,6 +53,9 @@ row_function.itemsizes[cudf_str_dtype] = string_view.size_bytes utils.arg_handlers.append(str_view_arg_handler) + + masked_typing.MASKED_INIT_MAP[udf_string] = udf_string + _STRING_UDFS_ENABLED = True except ImportError as e: diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 7baf2d585e2..f600e33ff64 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -70,6 +70,7 @@ def _type_to_masked_type(t): "attempting to use a column of unsupported dtype in a UDF. " f"Supported dtypes are:\n{supported_type_str}" ) + breakpoint() return types.Poison(err) else: return result diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 59041977f87..fdfd013bad7 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -7,7 +7,7 @@ from numba.core.typing import signature as nb_signature from numba.cuda.cudaimpl import lower as cuda_lower -from strings_udf._typing import size_type, string_view +from strings_udf._typing import size_type, string_view, udf_string from strings_udf.lowering import ( contains_impl, count_impl, @@ -22,8 +22,11 @@ istitle_impl, isupper_impl, len_impl, + lstrip_impl, rfind_impl, + rstrip_impl, startswith_impl, + strip_impl, ) from cudf.core.udf.masked_typing import MaskedType @@ -79,6 +82,13 @@ def masked_binary_func_impl(context, builder, sig, args): ) +create_binary_string_func("MaskedType.strip", strip_impl, udf_string) + +create_binary_string_func("MaskedType.lstrip", lstrip_impl, udf_string) + +create_binary_string_func("MaskedType.rstrip", rstrip_impl, udf_string) + + create_binary_string_func( "MaskedType.startswith", startswith_impl, diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 1179688651f..93a45680cf1 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -13,7 +13,9 @@ id_unary_funcs, int_binary_funcs, size_type, + string_binary_funcs, string_view, + udf_string, ) from cudf.core.udf import masked_typing @@ -172,6 +174,13 @@ def resolve_valid(self, mod): create_masked_binary_attr(f"MaskedType.{func}", size_type), ) +for func in string_binary_funcs: + setattr( + MaskedStringViewAttrs, + f"resolve_{func}", + create_masked_binary_attr(f"MaskedType.{func}", udf_string), + ) + for func in id_unary_funcs: setattr( MaskedStringViewAttrs, From 11e966c4d45c0b3c1ef4b2302a1541f87b4ccc8e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 07:45:21 -0800 Subject: [PATCH 43/54] cleanup --- python/strings_udf/strings_udf/lowering.py | 75 ++++++++++------------ 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 2b445094861..3dfc2b1c413 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -171,53 +171,46 @@ def deco(cuda_func): def binary_func_impl(context, builder, sig, args): lhs_ptr = builder.alloca(args[0].type) rhs_ptr = builder.alloca(args[1].type) - builder.store(args[0], lhs_ptr) builder.store(args[1], rhs_ptr) - result = context.compile_internal( - builder, - cuda_func, - nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), - (lhs_ptr, rhs_ptr), - ) - return result + # these conditional statements should compile out + if retty != udf_string: + # binary function of two strings yielding a fixed-width type + # example: str.startswith(other) -> bool + # shim functions can return the value through nb_retval + result = context.compile_internal( + builder, + cuda_func, + nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), + (lhs_ptr, rhs_ptr), + ) + return result + else: + # binary function of two strings yielding a new string + # example: str.strip(other) -> str + # shim functions can not return a struct due to C linkage + # so we operate on an extra void ptr and throw away nb_retval + udf_str_ptr = builder.alloca( + default_manager[udf_string].get_value_type() + ) + + _ = context.compile_internal( + builder, + cuda_func, + size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + (udf_str_ptr, lhs_ptr, rhs_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + return result._getvalue() return binary_func_impl return deco -def create_binary_string_func_return_string(binary_func): - def deco(cuda_func): - @cuda_lower(binary_func, string_view, string_view) - def binary_func_return_string_impl(context, builder, sig, args): - lhs_ptr = builder.alloca(args[0].type) - rhs_ptr = builder.alloca(args[1].type) - builder.store(args[0], lhs_ptr) - builder.store(args[1], rhs_ptr) - - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() - ) - - _ = context.compile_internal( - builder, - cuda_func, - size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), - (udf_str_ptr, lhs_ptr, rhs_ptr), - ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - return result._getvalue() - - return binary_func_return_string_impl - - return deco - - @create_binary_string_func(operator.contains, types.boolean) def contains_impl(st, substr): return _string_view_contains(st, substr) @@ -253,17 +246,17 @@ def lt_impl(st, rhs): return _string_view_lt(st, rhs) -@create_binary_string_func_return_string("StringView.strip") +@create_binary_string_func("StringView.strip", udf_string) def strip_impl(result, to_strip, strip_char): return _string_view_strip(result, to_strip, strip_char) -@create_binary_string_func_return_string("StringView.lstrip") +@create_binary_string_func("StringView.lstrip", udf_string) def lstrip_impl(result, to_strip, strip_char): return _string_view_lstrip(result, to_strip, strip_char) -@create_binary_string_func_return_string("StringView.rstrip") +@create_binary_string_func("StringView.rstrip", udf_string) def rstrip_impl(result, to_strip, strip_char): return _string_view_rstrip(result, to_strip, strip_char) From 9991c7607994fa06d08b64d0f8a55c8d03c2f2d5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Nov 2022 07:46:07 -0800 Subject: [PATCH 44/54] more cleanup --- python/cudf/cudf/core/udf/masked_typing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index f600e33ff64..7baf2d585e2 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -70,7 +70,6 @@ def _type_to_masked_type(t): "attempting to use a column of unsupported dtype in a UDF. " f"Supported dtypes are:\n{supported_type_str}" ) - breakpoint() return types.Poison(err) else: return result From 837a49c727770066a0a06801ff69de47824be229 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 9 Nov 2022 10:27:59 -0600 Subject: [PATCH 45/54] Update python/strings_udf/strings_udf/lowering.py Co-authored-by: Vyas Ramasubramani --- python/strings_udf/strings_udf/lowering.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 3dfc2b1c413..49ba77ee996 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -190,7 +190,11 @@ def binary_func_impl(context, builder, sig, args): # binary function of two strings yielding a new string # example: str.strip(other) -> str # shim functions can not return a struct due to C linkage - # so we operate on an extra void ptr and throw away nb_retval + # so we create a new udf_string and pass a pointer to it + # for the shim function to write the output to. The return + # value of compile_internal is therefore discarded (although + # this may change in the future if we need to return error + # codes, for instance). udf_str_ptr = builder.alloca( default_manager[udf_string].get_value_type() ) From 302fe6021ceb58f31fc140402b1ac3b3b93ae3b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Nov 2022 08:43:00 -0800 Subject: [PATCH 46/54] address reviews --- python/cudf/cudf/core/udf/strings_typing.py | 4 ++-- python/cudf/cudf/tests/test_udf_masked_ops.py | 3 +++ python/strings_udf/strings_udf/_typing.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 93a45680cf1..f8f50600b12 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -13,7 +13,7 @@ id_unary_funcs, int_binary_funcs, size_type, - string_binary_funcs, + string_return_attrs, string_view, udf_string, ) @@ -174,7 +174,7 @@ def resolve_valid(self, mod): create_masked_binary_attr(f"MaskedType.{func}", size_type), ) -for func in string_binary_funcs: +for func in string_return_attrs: setattr( MaskedStringViewAttrs, f"resolve_{func}", diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 2d9951982f9..7af47f981d6 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -876,6 +876,7 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_strip(str_udf_data, strip_char): def func(row): @@ -884,6 +885,7 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_lstrip(str_udf_data, strip_char): def func(row): @@ -892,6 +894,7 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_rstrip(str_udf_data, strip_char): def func(row): diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 5da73413cef..a309a9cb93c 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -229,7 +229,7 @@ def resolve_count(self, mod): "isnumeric", "istitle", ] -string_binary_funcs = ["strip", "lstrip", "rstrip"] +string_return_attrs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: setattr( @@ -238,7 +238,7 @@ def resolve_count(self, mod): create_binary_attr(func, types.boolean), ) -for func in string_binary_funcs: +for func in string_return_attrs: setattr( StringViewAttrs, f"resolve_{func}", From e0f98cc9ee0dc413f755e4696eaafdf5024a89c9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Nov 2022 12:07:29 -0800 Subject: [PATCH 47/54] plumb concat to empty shim function for now --- python/cudf/cudf/tests/test_udf_masked_ops.py | 9 ++++++ .../strings_udf/cpp/src/strings/udf/shim.cu | 8 ++++++ python/strings_udf/strings_udf/_typing.py | 5 ++++ python/strings_udf/strings_udf/lowering.py | 28 +++++++++++++++++++ .../strings_udf/tests/test_string_udfs.py | 8 ++++++ 5 files changed, 58 insertions(+) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 7af47f981d6..fbe6b3f8888 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -903,6 +903,15 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test +@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_concat(str_udf_data, concat_char): + def func(row): + return row["str_col"] + concat_char + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] ) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 63e740c5226..cac97d454ac 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -270,3 +270,11 @@ extern "C" __device__ int rstrip(int* nb_retval, return 0; } + +extern "C" __device__ int concat(int* nb_retval, + void* const* to_strip, + void* udf_str, + void* const* strip_str) +{ + return 0; +} diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index a309a9cb93c..b678db88b95 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -159,8 +159,13 @@ def generic(self, args, kws): register_stringview_binaryop(operator.gt, types.boolean) register_stringview_binaryop(operator.le, types.boolean) register_stringview_binaryop(operator.ge, types.boolean) + +# st in other register_stringview_binaryop(operator.contains, types.boolean) +# st + other +register_stringview_binaryop(operator.add, udf_string) + def create_binary_attr(attrname, retty): """ diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index 49ba77ee996..f4b227f26dc 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -25,6 +25,9 @@ # CUDA function declarations # read-only (input is a string_view, output is a fixed with type) _string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR)) +_concat_string_view = cuda.declare_device( + "concat", types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) +) def _declare_binary_func(lhs, rhs, out, name): @@ -159,6 +162,31 @@ def len_impl(context, builder, sig, args): return result +def call_concat_string_view(result, lhs, rhs): + return _concat_string_view(result, lhs, rhs) + + +@cuda_lower(operator.add, string_view, string_view) +def concat_impl(context, builder, sig, args): + lhs_ptr = builder.alloca(args[0].type) + rhs_ptr = builder.alloca(args[1].type) + builder.store(args[0], lhs_ptr) + builder.store(args[1], rhs_ptr) + + udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) + _ = context.compile_internal( + builder, + call_concat_string_view, + types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + (udf_str_ptr, lhs_ptr, rhs_ptr), + ) + + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + return result._getvalue() + + def create_binary_string_func(binary_func, retty): """ Provide a wrapper around numba's low-level extension API which diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 522433d404f..abaa6919502 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -302,3 +302,11 @@ def func(st): return st.rstrip(strip_char) run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_concat(data, concat_char): + def func(st): + return st + concat_char + + run_udf_test(data, func, "str") From 2bbba3be6f8db5b0ff1375da6cb6f5a02241191d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Nov 2022 18:03:04 -0800 Subject: [PATCH 48/54] troublesome segfaulting shim function --- python/strings_udf/cpp/src/strings/udf/shim.cu | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index cac97d454ac..73f0755c941 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -271,10 +271,15 @@ extern "C" __device__ int rstrip(int* nb_retval, return 0; } -extern "C" __device__ int concat(int* nb_retval, - void* const* to_strip, - void* udf_str, - void* const* strip_str) +extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs, void* const* rhs) { + auto lhs_ptr = reinterpret_cast(lhs); + auto rhs_ptr = reinterpret_cast(rhs); + auto udf_str_ptr = reinterpret_cast(udf_str); + + udf_string result; + result.append(*lhs_ptr).append(*rhs_ptr); + *udf_str_ptr = result; + printf("%s\n", result.data()); return 0; } From 906f6d5d31ed3947b5534514ae7c0aaa7f6c0cc5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Nov 2022 07:59:06 -0800 Subject: [PATCH 49/54] zero out preallocated udf_string --- python/strings_udf/cpp/src/strings/udf/shim.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 015b6d242d6..383b2ff055b 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -278,8 +278,10 @@ extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs auto udf_str_ptr = reinterpret_cast(udf_str); - udf_string result; + // struct allocated by numba is not initialized + memset(udf_str_ptr, 0, sizeof(udf_string)) + udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); *udf_str_ptr = result; printf("%s\n", result.data()); From bdb64e9d8effb3bb4934a3f9bea6185040adc248 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Nov 2022 08:37:16 -0800 Subject: [PATCH 50/54] add tests for maskedtype and a little extra typing --- python/cudf/cudf/core/udf/strings_typing.py | 10 ++++++++++ python/strings_udf/cpp/src/strings/udf/shim.cu | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index f8f50600b12..e8a35c12f71 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -59,6 +59,16 @@ def len_typing(self, args, kws): return nb_signature(size_type, args[0]) +@register_string_function(operator.add) +def concat_typing(self, args, kws): + if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): + return nb_signature( + MaskedType(udf_string), + MaskedType(string_view), + MaskedType(string_view), + ) + + @register_string_function(operator.contains) def contains_typing(self, args, kws): if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 383b2ff055b..5c191062bf0 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -279,9 +279,9 @@ extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs auto udf_str_ptr = reinterpret_cast(udf_str); // struct allocated by numba is not initialized - memset(udf_str_ptr, 0, sizeof(udf_string)) + memset(udf_str_ptr, 0, sizeof(udf_string)); - udf_string result; + udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); *udf_str_ptr = result; printf("%s\n", result.data()); From bfff98a0d356f0c3d3c56b89f90a54637ccc49b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 10 Nov 2022 13:34:01 -0600 Subject: [PATCH 51/54] Update python/strings_udf/cpp/src/strings/udf/shim.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- python/strings_udf/cpp/src/strings/udf/shim.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 5c191062bf0..7b9a9074753 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -284,6 +284,5 @@ extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); *udf_str_ptr = result; - printf("%s\n", result.data()); return 0; } From c22123ab1eeac0b8e2a23a9ec8534cb51daee5c1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Nov 2022 09:42:04 -0800 Subject: [PATCH 52/54] move memset into lowering --- python/strings_udf/cpp/src/strings/udf/shim.cu | 3 --- python/strings_udf/strings_udf/lowering.py | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 7b9a9074753..bc717da4669 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -278,9 +278,6 @@ extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs auto udf_str_ptr = reinterpret_cast(udf_str); - // struct allocated by numba is not initialized - memset(udf_str_ptr, 0, sizeof(udf_string)); - udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); *udf_str_ptr = result; diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index d98384c0d02..f4bc73c5b6f 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -174,7 +174,9 @@ def concat_impl(context, builder, sig, args): builder.store(args[0], lhs_ptr) builder.store(args[1], rhs_ptr) - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) + udf_str_ptr = cgutils.alloca_once( + builder, default_manager[udf_string].get_value_type(), zfill=True + ) _ = context.compile_internal( builder, call_concat_string_view, From 7255d944a6c52d5358058a01c3e3cb57f3796248 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Nov 2022 07:47:55 -0800 Subject: [PATCH 53/54] use placement new --- python/strings_udf/cpp/src/strings/udf/shim.cu | 2 +- python/strings_udf/strings_udf/lowering.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index bc717da4669..737afff4a1b 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -276,7 +276,7 @@ extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs auto lhs_ptr = reinterpret_cast(lhs); auto rhs_ptr = reinterpret_cast(rhs); - auto udf_str_ptr = reinterpret_cast(udf_str); + auto udf_str_ptr = new (udf_str) udf_string; udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index f4bc73c5b6f..d98384c0d02 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -174,9 +174,7 @@ def concat_impl(context, builder, sig, args): builder.store(args[0], lhs_ptr) builder.store(args[1], rhs_ptr) - udf_str_ptr = cgutils.alloca_once( - builder, default_manager[udf_string].get_value_type(), zfill=True - ) + udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) _ = context.compile_internal( builder, call_concat_string_view, From d6d030c0528981baa040a3184a735cd2ddf8e3fa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Nov 2022 06:11:24 -0800 Subject: [PATCH 54/54] add reflected concat test --- python/strings_udf/strings_udf/tests/test_string_udfs.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index abaa6919502..49663ee02ec 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -310,3 +310,11 @@ def func(st): return st + concat_char run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_concat_reflected(data, concat_char): + def func(st): + return concat_char + st + + run_udf_test(data, func, "str")