From 797433d1f5450f5397ecc319caacf3f0fc6fcc22 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Feb 2022 15:25:11 -0500 Subject: [PATCH 1/4] Move standalone string functions from string_view.hpp to string.hpp --- cpp/include/cudf/strings/string.hpp | 121 +++++++++++++++++++++++ cpp/include/cudf/strings/string_view.cuh | 3 +- cpp/include/cudf/strings/string_view.hpp | 92 ----------------- 3 files changed, 123 insertions(+), 93 deletions(-) create mode 100644 cpp/include/cudf/strings/string.hpp diff --git a/cpp/include/cudf/strings/string.hpp b/cpp/include/cudf/strings/string.hpp new file mode 100644 index 00000000000..1b88a9dd8fd --- /dev/null +++ b/cpp/include/cudf/strings/string.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +/** + * @file + * @brief Standalone string functions. + */ + +namespace cudf { + +using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes + +namespace strings { +namespace detail { + +/** + * @brief This will return true if passed the first byte of a UTF-8 character. + * + * @param byte Any byte from a valid UTF-8 character + * @return true if this is the first byte of the character + */ +constexpr bool is_begin_utf8_char(uint8_t byte) +{ + // A byte whose top two bits are 10, i.e. (byte & 0xC0) == 0x80, is a continuation byte of a character. 
+ return (byte & 0xC0) != 0x80; +} + +/** + * @brief Returns the number of bytes in the specified character. + * + * @param character Single character + * @return Number of bytes + */ +constexpr size_type bytes_in_char_utf8(char_utf8 character) +{ + return 1 + static_cast((character & unsigned{0x0000FF00}) > 0) + + static_cast((character & unsigned{0x00FF0000}) > 0) + + static_cast((character & unsigned{0xFF000000}) > 0); +} + +/** + * @brief Returns the number of bytes in the character that starts with the provided byte. + * + * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a + * single character. For example, for the two-byte 0xC3A8 single character, + * the first byte would return 2 and the second byte would return 0. + * + * @param byte Byte from an encoded character. + * @return Number of bytes. + */ +constexpr size_type bytes_in_utf8_byte(uint8_t byte) +{ + return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix + + static_cast((byte & 0xC0) == 0xC0) // 2-byte character prefix + - static_cast((byte & 0xC0) == 0x80); // intermediate byte +} + +/** + * @brief Convert a char array into a char_utf8 value. + * + * @param str String containing encoded char bytes. + * @param[out] character Single char_utf8 value. + * @return The number of bytes in the character + */ +constexpr size_type to_char_utf8(const char* str, char_utf8& character) +{ + size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); + + character = static_cast(*str++) & 0xFF; + if (chr_width > 1) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 8; + if (chr_width > 2) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 16; + if (chr_width > 3) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 24; + } + } + } + return chr_width; +} + +/** + * @brief Place a char_utf8 value into a char array. 
+ * + * @param character Single character + * @param[out] str Allocated char array with enough space to hold the encoded character. + * @return The number of bytes in the character + */ +constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +{ + size_type const chr_width = bytes_in_char_utf8(character); + for (size_type idx = 0; idx < chr_width; ++idx) { + str[chr_width - idx - 1] = static_cast(character) & 0xFF; + character = character >> 8; + } + return chr_width; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 24c8bfea2be..f374995e129 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include #include #ifndef __CUDA_ARCH__ diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index f88f573ac0c..9f7fa433b9c 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -333,96 +333,4 @@ class string_view { __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; }; -namespace strings { -namespace detail { - -/** - * @brief This will return true if passed the first byte of a UTF-8 character. - * - * @param byte Any byte from a valid UTF-8 character - * @return true if this the first byte of the character - */ -constexpr bool is_begin_utf8_char(uint8_t byte) -{ - // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. - return (byte & 0xC0) != 0x80; -} - -/** - * @brief Returns the number of bytes in the specified character. 
- * - * @param character Single character - * @return Number of bytes - */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) -{ - return 1 + static_cast((character & unsigned{0x0000FF00}) > 0) + - static_cast((character & unsigned{0x00FF0000}) > 0) + - static_cast((character & unsigned{0xFF000000}) > 0); -} - -/** - * @brief Returns the number of bytes used to represent the provided byte. - * - * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a - * single character. For example, for the two-byte 0xC3A8 single character, - * the first byte would return 2 and the second byte would return 0. - * - * @param byte Byte from an encoded character. - * @return Number of bytes. - */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) -{ - return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix - + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix - + static_cast((byte & 0xC0) == 0xC0) // 2-byte character prefix - - static_cast((byte & 0xC0) == 0x80); // intermediate byte -} - -/** - * @brief Convert a char array into a char_utf8 value. - * - * @param str String containing encoded char bytes. - * @param[out] character Single char_utf8 value. - * @return The number of bytes in the character - */ -CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character) -{ - size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); - - character = static_cast(*str++) & 0xFF; - if (chr_width > 1) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 8; - if (chr_width > 2) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 16; - if (chr_width > 3) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 24; - } - } - } - return chr_width; -} - -/** - * @brief Place a char_utf8 value into a char array. 
- * - * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded character. - * @return The number of bytes in the character - */ -CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str) -{ - size_type const chr_width = bytes_in_char_utf8(character); - for (size_type idx = 0; idx < chr_width; ++idx) { - str[chr_width - idx - 1] = static_cast(character) & 0xFF; - character = character >> 8; - } - return chr_width; -} - -} // namespace detail -} // namespace strings } // namespace cudf From ff01046edb95ddcd06f412dc1175213f35353882 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Feb 2022 15:59:29 -0500 Subject: [PATCH 2/4] move string.hpp to detail --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/strings/{ => detail}/string.hpp | 0 cpp/include/cudf/strings/string_view.cuh | 2 +- cpp/include/cudf/strings/string_view.hpp | 2 +- 4 files changed, 3 insertions(+), 2 deletions(-) rename cpp/include/cudf/strings/{ => detail}/string.hpp (100%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4e20c979f6c..5e09c623b3d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -203,6 +203,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp + - test -f $PREFIX/include/cudf/strings/detail/string.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp diff --git a/cpp/include/cudf/strings/string.hpp b/cpp/include/cudf/strings/detail/string.hpp similarity index 100% rename from cpp/include/cudf/strings/string.hpp rename to cpp/include/cudf/strings/detail/string.hpp diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index f374995e129..c9c917df508 
100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #ifndef __CUDA_ARCH__ diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 9f7fa433b9c..0c76f7d818d 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 4dbc4cb9669ea3f09216f70b03f40258305e322b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Mar 2022 08:18:39 -0500 Subject: [PATCH 3/4] rename string.h to utf8.h --- cpp/include/cudf/strings/detail/{string.hpp => utf8.hpp} | 0 cpp/include/cudf/strings/string_view.cuh | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/include/cudf/strings/detail/{string.hpp => utf8.hpp} (100%) diff --git a/cpp/include/cudf/strings/detail/string.hpp b/cpp/include/cudf/strings/detail/utf8.hpp similarity index 100% rename from cpp/include/cudf/strings/detail/string.hpp rename to cpp/include/cudf/strings/detail/utf8.hpp diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index c9c917df508..9ef361d6519 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #ifndef __CUDA_ARCH__ From d0539b754f8652ad0ad41f82a90fce4dcf133544 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Mar 2022 09:19:43 -0500 Subject: [PATCH 4/4] update meta.yaml with new header name --- conda/recipes/libcudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 5e09c623b3d..0edcf825c8b 100644 
--- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -203,7 +203,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - - test -f $PREFIX/include/cudf/strings/detail/string.hpp + - test -f $PREFIX/include/cudf/strings/detail/utf8.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp