From 797433d1f5450f5397ecc319caacf3f0fc6fcc22 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 28 Feb 2022 15:25:11 -0500
Subject: [PATCH 1/4] Move standalone string functions from string_view.hpp to
 string.hpp

---
 cpp/include/cudf/strings/string.hpp      | 121 +++++++++++++++++++++++
 cpp/include/cudf/strings/string_view.cuh |   3 +-
 cpp/include/cudf/strings/string_view.hpp |  92 -----------------
 3 files changed, 123 insertions(+), 93 deletions(-)
 create mode 100644 cpp/include/cudf/strings/string.hpp

diff --git a/cpp/include/cudf/strings/string.hpp b/cpp/include/cudf/strings/string.hpp
new file mode 100644
index 00000000000..1b88a9dd8fd
--- /dev/null
+++ b/cpp/include/cudf/strings/string.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+/**
+ * @file
+ * @brief Standalone string functions.
+ */
+
+namespace cudf {
+
+using char_utf8 = uint32_t;  ///< UTF-8 characters are 1-4 bytes
+
+namespace strings {
+namespace detail {
+
+/**
+ * @brief Returns true if passed the first byte of a UTF-8 character.
+ *
+ * @param byte Any byte from a valid UTF-8 character
+ * @return true if this is the first byte of the character
+ */
+constexpr bool is_begin_utf8_char(uint8_t byte)
+{
+  // A continuation byte has the form 10xxxxxx, i.e. (byte & 0xC0) == 0x80.
+  return (byte & 0xC0) != 0x80;
+}
+
+/**
+ * @brief Returns the number of bytes in the specified character.
+ *
+ * @param character Single character; each non-zero byte above the lowest counts as one byte
+ * @return Number of bytes (1 to 4)
+ */
+constexpr size_type bytes_in_char_utf8(char_utf8 character)
+{
+  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
+         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
+         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
+}
+
+/**
+ * @brief Returns the number of bytes in the character that starts with the provided byte.
+ *
+ * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
+ * single character. For example, for the two-byte 0xC3A8 single character,
+ * the first byte would return 2 and the second byte would return 0.
+ *
+ * @param byte Byte from an encoded character.
+ * @return Number of bytes (0 to 4).
+ */
+constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+{
+  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
+         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
+         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
+         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte: result is 0
+}
+
+/**
+ * @brief Convert the first UTF-8 character of a char array into a char_utf8 value.
+ *
+ * @param str String containing encoded char bytes; up to 4 bytes are read.
+ * @param[out] character Single char_utf8 value, first encoded byte most significant.
+ * @return The number of bytes in the character (bytes_in_utf8_byte of the first byte)
+ */
+constexpr size_type to_char_utf8(const char* str, char_utf8& character)
+{
+  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
+
+  character = static_cast<char_utf8>(*str++) & 0xFF;  // 1st byte; mask drops any sign extension
+  if (chr_width > 1) {
+    character = character << 8;
+    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // append 2nd byte
+    if (chr_width > 2) {
+      character = character << 8;
+      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // append 3rd byte
+      if (chr_width > 3) {
+        character = character << 8;
+        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // append 4th byte
+      }
+    }
+  }
+  return chr_width;
+}
+
+/**
+ * @brief Place a char_utf8 value into a char array, most significant byte first.
+ *
+ * @param character Single character
+ * @param[out] str Allocated char array with enough space to hold the encoded character.
+ * @return The number of bytes in the character
+ */
+constexpr size_type from_char_utf8(char_utf8 character, char* str)
+{
+  size_type const chr_width = bytes_in_char_utf8(character);
+  for (size_type idx = 0; idx < chr_width; ++idx) {
+    str[chr_width - idx - 1] = static_cast<char>(character & 0xFF);  // fill from the end
+    character                = character >> 8;
+  }
+  return chr_width;
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 24c8bfea2be..f374995e129 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/strings/string.hpp>
 #include <cudf/strings/string_view.hpp>
 
 #ifndef __CUDA_ARCH__
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index f88f573ac0c..9f7fa433b9c 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -333,96 +333,4 @@ class string_view {
   __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const;
 };
 
-namespace strings {
-namespace detail {
-
-/**
- * @brief This will return true if passed the first byte of a UTF-8 character.
- *
- * @param byte Any byte from a valid UTF-8 character
- * @return true if this the first byte of the character
- */
-constexpr bool is_begin_utf8_char(uint8_t byte)
-{
-  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
-  return (byte & 0xC0) != 0x80;
-}
-
-/**
- * @brief Returns the number of bytes in the specified character.
- *
- * @param character Single character
- * @return Number of bytes
- */
-constexpr size_type bytes_in_char_utf8(char_utf8 character)
-{
-  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
-         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
-         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
-}
-
-/**
- * @brief Returns the number of bytes used to represent the provided byte.
- *
- * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
- * single character. For example, for the two-byte 0xC3A8 single character,
- * the first byte would return 2 and the second byte would return 0.
- *
- * @param byte Byte from an encoded character.
- * @return Number of bytes.
- */
-constexpr size_type bytes_in_utf8_byte(uint8_t byte)
-{
-  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
-         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
-         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
-         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte
-}
-
-/**
- * @brief Convert a char array into a char_utf8 value.
- *
- * @param str String containing encoded char bytes.
- * @param[out] character Single char_utf8 value.
- * @return The number of bytes in the character
- */
-CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character)
-{
-  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
-
-  character = static_cast<char_utf8>(*str++) & 0xFF;
-  if (chr_width > 1) {
-    character = character << 8;
-    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 8;
-    if (chr_width > 2) {
-      character = character << 8;
-      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 16;
-      if (chr_width > 3) {
-        character = character << 8;
-        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 24;
-      }
-    }
-  }
-  return chr_width;
-}
-
-/**
- * @brief Place a char_utf8 value into a char array.
- *
- * @param character Single character
- * @param[out] str Allocated char array with enough space to hold the encoded character.
- * @return The number of bytes in the character
- */
-CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str)
-{
-  size_type const chr_width = bytes_in_char_utf8(character);
-  for (size_type idx = 0; idx < chr_width; ++idx) {
-    str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
-    character                = character >> 8;
-  }
-  return chr_width;
-}
-
-}  // namespace detail
-}  // namespace strings
 }  // namespace cudf

From ff01046edb95ddcd06f412dc1175213f35353882 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 28 Feb 2022 15:59:29 -0500
Subject: [PATCH 2/4] move string.hpp to detail

---
 conda/recipes/libcudf/meta.yaml                  | 1 +
 cpp/include/cudf/strings/{ => detail}/string.hpp | 0
 cpp/include/cudf/strings/string_view.cuh         | 2 +-
 cpp/include/cudf/strings/string_view.hpp         | 2 +-
 4 files changed, 3 insertions(+), 2 deletions(-)
 rename cpp/include/cudf/strings/{ => detail}/string.hpp (100%)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 4e20c979f6c..5e09c623b3d 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -203,6 +203,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/string.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
diff --git a/cpp/include/cudf/strings/string.hpp b/cpp/include/cudf/strings/detail/string.hpp
similarity index 100%
rename from cpp/include/cudf/strings/string.hpp
rename to cpp/include/cudf/strings/detail/string.hpp
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index f374995e129..c9c917df508 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/strings/string.hpp>
+#include <cudf/strings/detail/string.hpp>
 #include <cudf/strings/string_view.hpp>
 
 #ifndef __CUDA_ARCH__
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index 9f7fa433b9c..0c76f7d818d 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 4dbc4cb9669ea3f09216f70b03f40258305e322b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Mar 2022 08:18:39 -0500
Subject: [PATCH 3/4] rename string.hpp to utf8.hpp

---
 cpp/include/cudf/strings/detail/{string.hpp => utf8.hpp} | 0
 cpp/include/cudf/strings/string_view.cuh                 | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/include/cudf/strings/detail/{string.hpp => utf8.hpp} (100%)

diff --git a/cpp/include/cudf/strings/detail/string.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
similarity index 100%
rename from cpp/include/cudf/strings/detail/string.hpp
rename to cpp/include/cudf/strings/detail/utf8.hpp
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index c9c917df508..9ef361d6519 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/strings/detail/string.hpp>
+#include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.hpp>
 
 #ifndef __CUDA_ARCH__

From d0539b754f8652ad0ad41f82a90fce4dcf133544 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Mar 2022 09:19:43 -0500
Subject: [PATCH 4/4] update meta.yaml with new header name

---
 conda/recipes/libcudf/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 5e09c623b3d..0edcf825c8b 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -203,7 +203,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
-    - test -f $PREFIX/include/cudf/strings/detail/string.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/utf8.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp