From 97ce908875ae32f7c93a25105dd567a0bba97495 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Apr 2022 16:25:15 -0400 Subject: [PATCH 1/4] add dstring class --- cpp/include/cudf/strings/udf/dstring.cuh | 454 +++++++++++++++++++ cpp/include/cudf/strings/udf/dstring.hpp | 526 +++++++++++++++++++++++ 2 files changed, 980 insertions(+) create mode 100644 cpp/include/cudf/strings/udf/dstring.cuh create mode 100644 cpp/include/cudf/strings/udf/dstring.hpp diff --git a/cpp/include/cudf/strings/udf/dstring.cuh b/cpp/include/cudf/strings/udf/dstring.cuh new file mode 100644 index 00000000000..c9017e11bd0 --- /dev/null +++ b/cpp/include/cudf/strings/udf/dstring.cuh @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "dstring.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Allocate a character buffer of `bytes + 1` bytes. + * + * The extra byte holds a null-terminator written at offset `bytes`. + * NOTE(review): the pointer returned by malloc is written to without a null check. + * + * @param bytes Number of usable bytes requested + * @return Pointer to the new buffer + */ +__device__ inline char* dstring::allocate(cudf::size_type bytes) +{ + char* data = static_cast<char*>(malloc(bytes + 1)); + data[bytes] = 0; // add null-terminator so we can printf strings in device code + return data; +} + +/** + * @brief Free a buffer obtained from allocate(); a null pointer is ignored. + */ +__device__ inline void dstring::deallocate(char* data) +{ + if (data) free(data); +} + +/** + * @brief Move the string contents into a new buffer with capacity `bytes`. + * + * At most `bytes` bytes of the current contents are copied. `m_bytes` is not + * changed here; callers update it after the buffer has been swapped. + * + * @param bytes New capacity in bytes + */ +__device__ inline void dstring::reallocate(cudf::size_type bytes) +{ + auto const copy_bytes = std::min(m_bytes, bytes); + auto new_data = allocate(bytes); + memcpy(new_data, m_data, copy_bytes); + // allocate() only terminates at offset `bytes`; terminate the copied contents as well + new_data[copy_bytes] = 0; + deallocate(m_data); + m_data = new_data; + m_capacity = bytes; +} + +__device__ inline dstring::dstring(char const* data, cudf::size_type bytes) + : m_bytes(bytes), m_capacity(bytes) +{ + m_data = allocate(m_capacity); + memcpy(m_data, data, bytes); +} + +__device__ inline dstring::dstring(cudf::size_type count, cudf::char_utf8 chr) +{ + // a non-positive count leaves the default (null, zero-sized) state + if (count <= 0) { return; } + m_bytes = m_capacity = cudf::strings::detail::bytes_in_char_utf8(chr) * count; + m_data = allocate(m_capacity); + auto out_ptr = m_data; + for (auto idx = 0; idx < count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } +} + +__device__ inline dstring::dstring(char const* data) +{ + m_bytes = m_capacity = cudf::strings::detail::bytes_in_null_terminated_string(data); + m_data = allocate(m_capacity); + memcpy(m_data, data, m_bytes); +} + +// the copy is sized to the source contents (src.m_bytes), not to the source capacity +__device__ inline dstring::dstring(dstring const& src) + : m_bytes(src.m_bytes), m_capacity(src.m_bytes) +{ + m_data = allocate(m_capacity); + memcpy(m_data, src.m_data, m_bytes); +} + +// takes over the buffer of `src` and resets `src` to the null, zero-sized state +__device__ inline dstring::dstring(dstring&& src) + : m_data(src.m_data), m_bytes(src.m_bytes), m_capacity(src.m_capacity) +{ + src.m_data = nullptr; + src.m_bytes = 0; + src.m_capacity = 0; +} + +__device__ inline dstring::dstring(cudf::string_view const& str) + : m_bytes(str.size_bytes()), m_capacity(str.size_bytes()) +{ + m_data = 
allocate(m_capacity); + memcpy(m_data, str.data(), m_bytes); +} + +__device__ inline dstring::~dstring() { deallocate(m_data); } + +// copy-assignment: `str` is converted to a cudf::string_view and copied by assign() +__device__ inline dstring& dstring::operator=(dstring const& str) { return assign(str); } + +__device__ inline dstring& dstring::operator=(dstring&& str) { return assign(std::move(str)); } + +__device__ inline dstring& dstring::operator=(cudf::string_view const& str) { return assign(str); } + +__device__ inline dstring& dstring::operator=(char const* str) { return assign(str); } + +/** + * @brief Take ownership of the buffer held by `str`, leaving `str` empty. + * + * The buffer currently owned by this instance is freed first so it is not leaked. + */ +__device__ inline dstring& dstring::assign(dstring&& str) +{ + if (this == &str) { return *this; } + deallocate(m_data); + m_data = str.m_data; + m_bytes = str.m_bytes; + m_capacity = str.m_capacity; + str.m_data = nullptr; + str.m_bytes = 0; + str.m_capacity = 0; + return *this; +} + +__device__ inline dstring& dstring::assign(cudf::string_view const& str) +{ + return assign(str.data(), str.size_bytes()); +} + +__device__ inline dstring& dstring::assign(char const* str) +{ + return assign(str, cudf::strings::detail::bytes_in_null_terminated_string(str)); +} + +/** + * @brief Copy `bytes` bytes from `str` into this string, growing the buffer if needed. + * + * `str` may point into this string's own buffer (e.g. self-assignment), so the old + * buffer is released only after its bytes have been copied. + */ +__device__ inline dstring& dstring::assign(char const* str, cudf::size_type bytes) +{ + // `>=` rather than `>` so a null buffer (capacity 0) is replaced even when bytes == 0 + if (bytes >= m_capacity) { + auto new_data = allocate(bytes); + memcpy(new_data, str, bytes); + deallocate(m_data); + m_data = new_data; + m_capacity = bytes; + } else if (str != m_data) { + memcpy(m_data, str, bytes); + } + m_bytes = bytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline cudf::size_type dstring::size_bytes() const { return m_bytes; } + +__device__ inline cudf::size_type dstring::length() const +{ + return cudf::strings::detail::characters_in_string(m_data, m_bytes); +} + +// one less than the largest size_type value: allocate() requests `bytes + 1` bytes +__device__ inline cudf::size_type dstring::max_size() const +{ + return std::numeric_limits<cudf::size_type>::max() - 1; +} + +__device__ inline char* dstring::data() { return m_data; } + +__device__ inline char const* dstring::data() const { return m_data; } + +__device__ inline bool dstring::is_empty() const { return m_bytes == 0; } + +__device__ inline bool dstring::is_null() const { return m_data == nullptr; } + +__device__ inline cudf::string_view::const_iterator 
dstring::begin() const +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), 0); +} + +__device__ inline cudf::string_view::const_iterator dstring::end() const +{ + return cudf::string_view::const_iterator(cudf::string_view(m_data, m_bytes), length()); +} + +__device__ inline cudf::char_utf8 dstring::at(cudf::size_type pos) const +{ + auto const offset = byte_offset(pos); + auto chr = cudf::char_utf8{0}; + // chr stays 0 when the offset is at or past the end of the string + if (offset < m_bytes) { cudf::strings::detail::to_char_utf8(data() + offset, chr); } + return chr; +} + +__device__ inline cudf::char_utf8 dstring::operator[](cudf::size_type pos) const { return at(pos); } + +__device__ inline cudf::size_type dstring::byte_offset(cudf::size_type pos) const +{ + cudf::size_type offset = 0; + + auto sptr = m_data; + auto eptr = sptr + m_bytes; + // walk the bytes until `pos` characters have been counted or the end is reached + while ((pos > 0) && (sptr < eptr)) { + // NOTE(review): cast target assumed to be uint8_t -- confirm against bytes_in_utf8_byte() + auto const byte = static_cast<uint8_t>(*sptr++); + auto const char_bytes = cudf::strings::detail::bytes_in_utf8_byte(byte); + // only bytes that report a non-zero width count as a character + if (char_bytes) { --pos; } + offset += char_bytes; + } + return offset; +} + +__device__ inline int dstring::compare(cudf::string_view const& in) const +{ + return compare(in.data(), in.size_bytes()); +} + +// the comparison itself is delegated to cudf::string_view::compare +__device__ inline int dstring::compare(char const* data, cudf::size_type bytes) const +{ + auto const view = static_cast<cudf::string_view>(*this); + return view.compare(data, bytes); +} + +__device__ inline bool dstring::operator==(cudf::string_view const& rhs) const +{ + // the size check skips the byte comparison when the lengths differ + return m_bytes == rhs.size_bytes() && compare(rhs) == 0; +} + +__device__ inline bool dstring::operator!=(cudf::string_view const& rhs) const +{ + return compare(rhs) != 0; +} + +__device__ inline bool dstring::operator<(cudf::string_view const& rhs) const +{ + return compare(rhs) < 0; +} + +__device__ inline bool dstring::operator>(cudf::string_view const& rhs) const +{ + return compare(rhs) > 0; +} + +__device__ inline bool dstring::operator<=(cudf::string_view const& rhs) const +{ + return compare(rhs) <= 0; +} + +__device__ 
inline bool dstring::operator>=(cudf::string_view const& rhs) const +{ + return compare(rhs) >= 0; +} + +// frees the buffer and returns to the null, zero-sized state +__device__ inline void dstring::clear() +{ + deallocate(m_data); + m_data = nullptr; + m_bytes = 0; + m_capacity = 0; +} + +__device__ inline void dstring::resize(cudf::size_type count) +{ + if (count < 0 || count > max_size()) { return; } + // a null buffer must be replaced even when count == 0: a terminator is written below + if (m_data == nullptr || count > m_capacity) { reallocate(count); } + + // add padding if necessary (null chars) + if (count > m_bytes) { memset(m_data + m_bytes, 0, count - m_bytes); } + + m_bytes = count; + m_data[m_bytes] = 0; +} + +__device__ inline void dstring::reserve(cudf::size_type count) +{ + // same upper bound as resize(): max_size() itself is a valid capacity + if (count <= max_size() && count > m_capacity) { reallocate(count); } +} + +__device__ inline cudf::size_type dstring::capacity() const { return m_capacity; } + +__device__ inline void dstring::shrink_to_fit() +{ + if (m_bytes < m_capacity) { reallocate(m_bytes); } +} + +/** + * @brief Append `in_bytes` bytes from `str`; there is no effect if `in_bytes <= 0`. + * + * When the buffer must grow, the new capacity is twice the new size. + */ +__device__ inline dstring& dstring::append(char const* str, cudf::size_type in_bytes) +{ + if (in_bytes <= 0) { return *this; } + auto const nbytes = m_bytes + in_bytes; + if (nbytes > m_capacity) { + // `str` may point into this string (e.g. appending a string to itself), + // so fill the new buffer completely before releasing the old one + auto const new_capacity = 2 * nbytes; + auto new_data = allocate(new_capacity); + memcpy(new_data, m_data, m_bytes); + memcpy(new_data + m_bytes, str, in_bytes); + deallocate(m_data); + m_data = new_data; + m_capacity = new_capacity; + } else { + memcpy(m_data + m_bytes, str, in_bytes); + } + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline dstring& dstring::append(char const* str) +{ + return append(str, cudf::strings::detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline dstring& dstring::append(cudf::char_utf8 chr, cudf::size_type count) +{ + if (count <= 0) { return *this; } + auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * count; + auto const nbytes = m_bytes + char_bytes; + if (nbytes > m_capacity) { reallocate(2 * nbytes); } + // encode `chr` into the tail of the buffer `count` times + auto out_ptr = m_data + m_bytes; + for (auto idx = 0; idx < count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline dstring& dstring::append(cudf::string_view const& in) +{ + return 
append(in.data(), in.size_bytes()); +} + +__device__ inline dstring& dstring::operator+=(cudf::string_view const& in) { return append(in); } + +__device__ inline dstring& dstring::operator+=(cudf::char_utf8 chr) { return append(chr); } + +__device__ inline dstring& dstring::operator+=(char const* str) { return append(str); } + +// the insert() overloads are replace() calls over an empty range (count == 0) +__device__ inline dstring& dstring::insert(cudf::size_type pos, + char const* str, + cudf::size_type in_bytes) +{ + return replace(pos, 0, str, in_bytes); +} + +__device__ inline dstring& dstring::insert(cudf::size_type pos, char const* str) +{ + return insert(pos, str, cudf::strings::detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline dstring& dstring::insert(cudf::size_type pos, cudf::string_view const& in) +{ + return insert(pos, in.data(), in.size_bytes()); +} + +__device__ inline dstring& dstring::insert(cudf::size_type pos, + cudf::size_type count, + cudf::char_utf8 chr) +{ + return replace(pos, 0, count, chr); +} + +__device__ inline dstring dstring::substr(cudf::size_type pos, cudf::size_type count) const +{ + if (pos < 0) { return dstring{"", 0}; } + auto const spos = byte_offset(pos); + if (spos >= m_bytes) { return dstring{"", 0}; } + // a character is at least one byte, so a count covering all remaining bytes reaches the end; + // testing this first also keeps `pos + count` from overflowing for very large counts + auto const to_end = (count < 0) || (count >= m_bytes - spos); + auto const epos = to_end ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + return dstring{data() + spos, epos - spos}; +} + +/** + * @brief Utility for replace(): move the bytes that follow the replaced range. + * + * Must be called before `m_bytes` is updated and after the buffer can hold `nbytes`. + * + * @param spos Byte position the trailing bytes are moved to + * @param epos Byte position the trailing bytes currently start at + * @param nbytes Size in bytes the string will have after the replacement + */ +__device__ inline void dstring::shift_bytes(cudf::size_type spos, + cudf::size_type epos, + cudf::size_type nbytes) +{ + if (nbytes < m_bytes) { + // shift bytes to the left [...wxyz] -> [wxyzxyz] + auto src = epos; + auto tgt = spos; + while (tgt < nbytes) { + m_data[tgt++] = m_data[src++]; + } + } else if (nbytes > m_bytes) { + // shift bytes to the right [abcd...] 
-> [abcabcd] + auto src = m_bytes; + auto tgt = nbytes; + while (src > epos) { + m_data[--tgt] = m_data[--src]; + } + } +} + +/** + * @brief Replace `count` characters starting at character `pos` with `in_bytes` bytes of `str`. + * + * NOTE(review): for well-formed UTF-8 byte_offset() stops at size_bytes(), so a `pos` + * beyond length() behaves like `pos == length()` instead of being rejected. + */ +__device__ inline dstring& dstring::replace(cudf::size_type pos, + cudf::size_type count, + char const* str, + cudf::size_type in_bytes) +{ + if (pos < 0 || in_bytes < 0) { return *this; } + auto const spos = byte_offset(pos); + if (spos > m_bytes) { return *this; } + // test the remaining byte count first so `pos + count` cannot overflow for large counts + auto const to_end = (count < 0) || (count >= m_bytes - spos); + auto const epos = to_end ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + + // compute new size + auto const nbytes = m_bytes + in_bytes - (epos - spos); + // a null buffer must be replaced too: the terminator is written below + if (m_data == nullptr || nbytes > m_capacity) { reallocate(2 * nbytes); } + + // move bytes -- make room for replacement + shift_bytes(spos + in_bytes, epos, nbytes); + + // insert the replacement; erase() passes a null `str` with in_bytes == 0 + if (in_bytes > 0) { memcpy(m_data + spos, str, in_bytes); } + + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +__device__ inline dstring& dstring::replace(cudf::size_type pos, + cudf::size_type count, + char const* str) +{ + return replace(pos, count, str, cudf::strings::detail::bytes_in_null_terminated_string(str)); +} + +__device__ inline dstring& dstring::replace(cudf::size_type pos, + cudf::size_type count, + cudf::string_view const& in) +{ + return replace(pos, count, in.data(), in.size_bytes()); +} + +// replace `count` characters starting at `pos` with `chr` repeated `chr_count` times +__device__ inline dstring& dstring::replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr) +{ + if (pos < 0 || chr_count < 0) { return *this; } + auto const spos = byte_offset(pos); + if (spos > m_bytes) { return *this; } + // test the remaining byte count first so `pos + count` cannot overflow for large counts + auto const to_end = (count < 0) || (count >= m_bytes - spos); + auto const epos = to_end ? m_bytes : std::min(byte_offset(pos + count), m_bytes); + + // compute input size + auto const char_bytes = cudf::strings::detail::bytes_in_char_utf8(chr) * chr_count; + // compute new output size + auto const nbytes = m_bytes + char_bytes - (epos - spos); + // a null buffer must be replaced too: the terminator is written below + if (m_data == nullptr || nbytes > m_capacity) { reallocate(2 * nbytes); } + + // move bytes -- make room for the new character(s) + shift_bytes(spos + char_bytes, epos, nbytes); + + // copy chr chr_count times + auto out_ptr = m_data + spos; + for (auto idx = 0; idx < chr_count; ++idx) { + out_ptr += cudf::strings::detail::from_char_utf8(chr, out_ptr); + } + + m_bytes = nbytes; + m_data[m_bytes] = 0; + return *this; +} + +// erasing is replacing the range with an empty character array +__device__ inline dstring& dstring::erase(cudf::size_type pos, cudf::size_type count) +{ + return replace(pos, count, nullptr, 0); +} + +__device__ inline cudf::size_type dstring::char_offset(cudf::size_type bytepos) const +{ + return cudf::strings::detail::characters_in_string(data(), bytepos); +} + +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/udf/dstring.hpp b/cpp/include/cudf/strings/udf/dstring.hpp new file mode 100644 index 00000000000..8d793ab42f3 --- /dev/null +++ b/cpp/include/cudf/strings/udf/dstring.hpp @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +/** + * @file + * @brief Class definition for cudf::strings::udf::dstring. 
+ */ + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Device-side string that owns a copy of its UTF-8 character data. + * + * Sizes and capacities are in bytes; positions (`pos`) are in UTF-8 characters. + */ +class dstring { + public: + /** + * @brief Represents unknown character position or length. + */ + static constexpr cudf::size_type npos = static_cast<cudf::size_type>(-1); + + /** + * @brief Cast to cudf::string_view operator + */ + __device__ operator cudf::string_view() const { return cudf::string_view(m_data, m_bytes); } + + /** + * @brief Create an empty string. + */ + dstring() = default; + + /** + * @brief Create a string using existing device memory. + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded string + * @param bytes Number of bytes in `data` + */ + __device__ dstring(char const* data, cudf::size_type bytes); + + /** + * @brief Create a string object from a null-terminated character array. + * + * The given memory is copied into the instance returned. + * + * @param data Device pointer to UTF-8 encoded null-terminated + * character array. + */ + __device__ dstring(char const* data); + + /** + * @brief Create a string object from a cudf::string_view. + * + * The input string data is copied into the instance returned. + * + * @param str String to copy + */ + __device__ dstring(cudf::string_view const& str); + + /** + * @brief Create a string object with `count` copies of character `chr`. + * + * @param count Number of times to copy `chr` + * @param chr Character from which to create the string + */ + __device__ dstring(cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Create a string object from another instance. + * + * The string data is copied from the `src` into the instance returned. + * + * @param src String to copy + */ + __device__ dstring(dstring const& src); + + /** + * @brief Create a string object from a move reference. + * + * The string data is moved from `src` into the instance returned. + * The `src` will have no content. 
+ * + * @param src String to move + */ + __device__ dstring(dstring&& src); + + /** + * @brief Free the character data owned by this string. + */ + __device__ ~dstring(); + + /** + * @brief Replace the contents of this string with a copy of the argument. + * + * The move overload takes over the argument's character data instead of copying it. + * + * @return This string with the new contents + */ + __device__ dstring& operator=(dstring const&); + __device__ dstring& operator=(dstring&&); + __device__ dstring& operator=(cudf::string_view const&); + __device__ dstring& operator=(char const*); + + /** + * @brief Return the number of bytes in this string. + */ + __device__ cudf::size_type size_bytes() const; + + /** + * @brief Return the number of characters in this string. + */ + __device__ cudf::size_type length() const; + + /** + * @brief Return the maximum number of bytes a dstring can hold. + */ + __device__ cudf::size_type max_size() const; + + /** + * @brief Return the internal pointer to the character array for this object. + */ + __device__ char* data(); + __device__ char const* data() const; + + /** + * @brief Returns true if there are no characters in this string. + */ + __device__ bool is_empty() const; + + /** + * @brief Returns true if `data()==nullptr` + * + * This is experimental and may be removed in the future. + */ + __device__ bool is_null() const; + + /** + * @brief Returns an iterator that can be used to navigate through + * the UTF-8 characters in this string. + * + * This returns a `cudf::string_view::const_iterator` which is read-only. + */ + __device__ cudf::string_view::const_iterator begin() const; + __device__ cudf::string_view::const_iterator end() const; + + /** + * @brief Returns the character at the specified position. + * + * This will return 0 if `pos >= length()`. + * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 at(cudf::size_type pos) const; + + /** + * @brief Returns the character at the specified index. + * + * This will return 0 if `pos >= length()`. + * Note this is read-only. Use replace() to modify a character. 
+ * + * @param pos Index position of character to return + * @return Character at position `pos` + */ + __device__ cudf::char_utf8 operator[](cudf::size_type pos) const; + + /** + * @brief Return the byte offset for a given character position. + * + * The byte offset for the character at `pos` such that + * `data() + byte_offset(pos)` points to the memory location + * of the character at position `pos`. + * + * @param pos Index position of the character whose byte offset is returned. + * @return Byte offset for character at `pos` + */ + __device__ cudf::size_type byte_offset(cudf::size_type pos) const; + + /** + * @brief Compare a target string with this string + * + * NOTE(review): the length tie-break below assumes the usual lexicographic order + * of cudf::string_view::compare, which this function delegates to -- confirm. + * + * @param str Target string to compare with this string + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but this string is shorter. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but this string is longer. + */ + __device__ int compare(cudf::string_view const& str) const; + + /** + * @brief Compare a target character array with this string + * + * NOTE(review): the length tie-break below assumes the usual lexicographic order + * of cudf::string_view::compare, which this function delegates to -- confirm. + * + * @param str Target array of UTF-8 characters. + * @param bytes Number of bytes in `str`. + * @return 0 If they compare equal + * <0 Either the value of the first character of this string that does + * not match is ordered before the corresponding character in `str`, + * or all compared characters match but `size_bytes() < bytes`. + * >0 Either the value of the first character of this string that does + * not match is ordered after the corresponding character in `str`, + * or all compared characters match but `size_bytes() > bytes`. 
+ */ + __device__ int compare(char const* str, cudf::size_type bytes) const; + + /** + * @brief Returns true if `rhs` matches this string exactly + */ + __device__ bool operator==(cudf::string_view const& rhs) const; + + /** + * @brief Returns true if `rhs` does not match this string + */ + __device__ bool operator!=(cudf::string_view const& rhs) const; + + /** + * @brief Returns true if this string is ordered before `rhs` + */ + __device__ bool operator<(cudf::string_view const& rhs) const; + + /** + * @brief Returns true if `rhs` is ordered before this string + */ + __device__ bool operator>(cudf::string_view const& rhs) const; + + /** + * @brief Returns true if this string matches or is ordered before `rhs` + */ + __device__ bool operator<=(cudf::string_view const& rhs) const; + + /** + * @brief Returns true if `rhs` matches or is ordered before this string + */ + __device__ bool operator>=(cudf::string_view const& rhs) const; + + /** + * @brief Remove all bytes from this string. + * + * All pointers, references, and iterators are invalidated. + */ + __device__ void clear(); + + /** + * @brief Resizes string to contain `count` bytes. + * + * If `count > size_bytes()` then zero-padding is added. + * If `count < size_bytes()` then the string is truncated to size `count`. + * + * All pointers, references, and iterators may be invalidated. + * + * @param count Size in bytes of this string. + */ + __device__ void resize(cudf::size_type count); + + /** + * @brief Reserve `count` bytes in this string. + * + * If `count > capacity()`, new memory is allocated and `capacity()` will + * be greater than or equal to `count`. + * There is no effect if `count <= capacity()`. + * + * @param count Total number of bytes to reserve for this string + */ + __device__ void reserve(cudf::size_type count); + + /** + * @brief Returns the number of bytes that the string has allocated. 
+ */ + __device__ cudf::size_type capacity() const; + + /** + * @brief Reduces internal allocation to just `size_bytes()`. + * + * All pointers, references, and iterators may be invalidated. + */ + __device__ void shrink_to_fit(); + + /** + * @brief Moves the contents of `str` into this string instance + * + * @param str String to move + * @return This string with the new contents + */ + __device__ dstring& assign(dstring&& str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str String to copy + * @return This string with the new contents + */ + __device__ dstring& assign(cudf::string_view const& str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str Null-terminated UTF-8 character array + * @return This string with the new contents + */ + __device__ dstring& assign(char const* str); + + /** + * @brief Replaces the contents of this string with contents of `str` + * + * @param str UTF-8 character array + * @param bytes Number of bytes to copy from `str` + * @return This string with the new contents + */ + __device__ dstring& assign(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ dstring& operator+=(cudf::string_view const& str); + + /** + * @brief Append a character to the end of this string. + * + * @param chr Character to append + * @return This string with the appended argument + */ + __device__ dstring& operator+=(cudf::char_utf8 chr); + + /** + * @brief Append a null-terminated device memory character array + * to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ dstring& operator+=(char const* str); + + /** + * @brief Append a null-terminated character array to the end of this string. 
+ * + * @param str String to append + * @return This string with the appended argument + */ + __device__ dstring& append(char const* str); + + /** + * @brief Append a character array to the end of this string. + * + * @param str Character array to append + * @param bytes Number of bytes from `str` to append. + * @return This string with the appended argument + */ + __device__ dstring& append(char const* str, cudf::size_type bytes); + + /** + * @brief Append a string to the end of this string. + * + * @param str String to append + * @return This string with the appended argument + */ + __device__ dstring& append(cudf::string_view const& str); + + /** + * @brief Append a character to the end of this string + * a specified number of times. + * + * @param chr Character to append + * @param count Number of times to append `chr` + * @return This string with the appended character(s) + */ + __device__ dstring& append(cudf::char_utf8 chr, cudf::size_type count = 1); + + /** + * @brief Insert a string into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param str String to insert into this one + * @return This string with the inserted argument + */ + __device__ dstring& insert(cudf::size_type pos, cudf::string_view const& str); + + /** + * @brief Insert a null-terminated character array into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param data Null-terminated character array to insert + * @return This string with the inserted argument + */ + __device__ dstring& insert(cudf::size_type pos, char const* data); + + /** + * @brief Insert a character array into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. 
+ * + * @param pos Character position to begin insert + * @param data Character array to insert + * @param bytes Number of bytes from `data` to insert + * @return This string with the inserted argument + */ + __device__ dstring& insert(cudf::size_type pos, char const* data, cudf::size_type bytes); + + /** + * @brief Insert a character one or more times into the character position specified. + * + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Character position to begin insert + * @param count Number of times to insert `chr` + * @param chr Character to insert + * @return This string with the inserted argument + */ + __device__ dstring& insert(cudf::size_type pos, cudf::size_type count, cudf::char_utf8 chr); + + /** + * @brief Returns a substring of this string. + * + * An empty string is returned if `pos < 0 or pos >= length()`. + * + * @param pos Character position to start the substring + * @param count Number of characters for the substring; + * This can be greater than the number of available characters. + * Default npos returns characters in range `[pos, length())`. + * @return New string with the specified characters + */ + __device__ dstring substr(cudf::size_type pos, cudf::size_type count = npos) const; + + /** + * @brief Replace a range of characters with a given string. + * + * Replaces characters in range `[pos, pos + count)` with `str`. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param str String to replace the given range + * @return This string modified with the replacement + */ + __device__ dstring& replace(cudf::size_type pos, + cudf::size_type count, + cudf::string_view const& str); + + /** + * @brief Replace a range of characters with a null-terminated character array. + * + * Replaces characters in range `[pos, pos + count)` with `data`. + * There is no effect if `pos < 0 or pos > length()`. 
+ * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data Null-terminated character array to replace the given range + * @return This string modified with the replacement + */ + __device__ dstring& replace(cudf::size_type pos, cudf::size_type count, char const* data); + + /** + * @brief Replace a range of characters with a given character array. + * + * Replaces characters in range `[pos, pos + count)` with `[data, data + bytes)`. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param data String to replace the given range + * @param bytes Number of bytes from data to use for replacement + * @return This string modified with the replacement + */ + __device__ dstring& replace(cudf::size_type pos, + cudf::size_type count, + char const* data, + cudf::size_type bytes); + + /** + * @brief Replace a range of characters with a character one or more times. + * + * Replaces characters in range `[pos, pos + count)` with `chr` `chr_count` times. + * There is no effect if `pos < 0 or pos > length()`. + * + * @param pos Position of first character to replace + * @param count Number of characters to replace + * @param chr_count Number of times `chr` will be repeated + * @param chr Character to use for replacement + * @return This string modified with the replacement + */ + __device__ dstring& replace(cudf::size_type pos, + cudf::size_type count, + cudf::size_type chr_count, + cudf::char_utf8 chr); + + /** + * @brief Removes specified characters from this string. + * + * Removes `min(count, length() - pos)` characters starting at `pos`. + * There is no effect if `pos < 0 or pos >= length()`. 
+ * + * @param pos Character position to begin removal + * @param count Number of characters to remove starting at `pos` + * @return This string with the characters removed + */ + __device__ dstring& erase(cudf::size_type pos, cudf::size_type count = npos); + + private: + char* m_data{}; // owned character buffer; nullptr for a default-constructed string + cudf::size_type m_bytes{}; // number of bytes currently in the string + cudf::size_type m_capacity{}; // usable bytes in m_data, not counting the terminator byte + + // utilities + __device__ char* allocate(cudf::size_type bytes); + __device__ void deallocate(char* data); + __device__ void reallocate(cudf::size_type bytes); + __device__ cudf::size_type char_offset(cudf::size_type bytepos) const; + __device__ void shift_bytes(cudf::size_type spos, cudf::size_type epos, cudf::size_type nbytes); +}; + +} // namespace udf +} // namespace strings +} // namespace cudf From f3d347ee5011c9f623d4a8bd03737c4c834ce9c7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Apr 2022 16:26:19 -0400 Subject: [PATCH 2/4] add udf column functions --- .../cudf/strings/udf/column_functions.hpp | 88 +++++++++++++ cpp/src/strings/udf/column_functions.cu | 118 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 cpp/include/cudf/strings/udf/column_functions.hpp create mode 100644 cpp/src/strings/udf/column_functions.cu diff --git a/cpp/include/cudf/strings/udf/column_functions.hpp b/cpp/include/cudf/strings/udf/column_functions.hpp new file mode 100644 index 00000000000..4539eb2a981 --- /dev/null +++ b/cpp/include/cudf/strings/udf/column_functions.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +//! Strings UDF support +namespace udf { + +/** + * @addtogroup strings_udfs + * @{ + * @file + * @brief Strings APIs for supporting user-defined functions + */ + +/** + * @brief Return a vector of cudf::string_view for the given strings column + * + * @param input Strings column + * @param mr Device memory resource used to allocate the returned vector + * @return Device vector of cudf::string_view objects + */ +rmm::device_uvector create_string_view_array( + cudf::strings_column_view const input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Return an empty dstring array + * + * Once finished with the array, call free_dstring_array to deallocate the dstring objects + * before destroying the returned memory buffer. + * + * @param size Number of empty dstring elements + * @param mr Device memory resource used to allocate the returned buffer + * @return Device buffer containing the empty dstring objects + */ +std::unique_ptr create_dstring_array( + size_type size, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Return a cudf::column given an array of dstring objects + * + * @param input dstring array + * @param mr Device memory resource used to allocate the returned column + * @return A strings column copy of the dstring objects + */ +std::unique_ptr make_strings_column( + device_span input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Free all the dstring objects in the given array + * + * Call this to free the internal memory within individual dstring objects. + * The input dstrings are modified (emptied) and can be reused. 
+ * + * @param input dstring array whose elements are emptied + */ +void free_dstring_array(device_span input); + +/** @} */ // end of group +} // namespace udf +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/udf/column_functions.cu b/cpp/src/strings/udf/column_functions.cu new file mode 100644 index 00000000000..0f98096a41f --- /dev/null +++ b/cpp/src/strings/udf/column_functions.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace udf { + +namespace { + +struct free_dstring_fn { + cudf::strings::udf::dstring* d_strings; + __device__ void operator()(cudf::size_type idx) { d_strings[idx].clear(); } +}; + +struct dstring_to_string_view_transform_fn { + __device__ cudf::string_view operator()(cudf::strings::udf::dstring const& d_str) + { + return cudf::string_view{d_str.data(), d_str.size_bytes()}; + } +}; + +} // namespace + +namespace detail { + +std::unique_ptr create_dstring_array(cudf::size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const output_vector_size = size * sizeof(cudf::strings::udf::dstring); + auto result = std::make_unique(output_vector_size, stream, mr); + cudaMemsetAsync(result->data(), 0, output_vector_size, stream.value()); // zero-fill on the allocation stream; all-zero bytes match a default-constructed (empty) dstring. NOTE(review): return code is unchecked, as before + return result; +} + +void free_dstring_array(device_span input, rmm::cuda_stream_view stream) +{ + auto const size = static_cast(input.size()); + auto d_strings = input.data(); + thrust::for_each_n( + rmm::exec_policy(stream), thrust::make_counting_iterator(0), size, free_dstring_fn{d_strings}); +} + +std::unique_ptr make_strings_column(device_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const size = static_cast(input.size()); + auto d_input = input.data(); + + // create string_views of the dstrings (temporary, so it uses the default resource rather than mr) + auto indices = rmm::device_uvector(size, stream); + thrust::transform(rmm::exec_policy(stream), + d_input, + d_input + size, + indices.data(), + dstring_to_string_view_transform_fn{}); + + return cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream, mr); +} + +} // namespace detail + +rmm::device_uvector create_string_view_array(cudf::strings_column_view const input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::create_string_vector_from_column(cudf::strings_column_view(input),
+ rmm::cuda_stream_default); // NOTE(review): mr is not forwarded here -- confirm whether the detail helper accepts a memory resource +} + +std::unique_ptr create_dstring_array(size_type size, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::create_dstring_array(size, rmm::cuda_stream_default, mr); +} + +std::unique_ptr make_strings_column(device_span input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::make_strings_column(input, rmm::cuda_stream_default, mr); +} + +void free_dstring_array(device_span input) +{ + CUDF_FUNC_RANGE(); + return detail::free_dstring_array(input, rmm::cuda_stream_default); +} + +} // namespace udf +} // namespace strings +} // namespace cudf From 7a3f0cd08b81e2b9230e44c27d01d896bda93647 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Apr 2022 16:26:55 -0400 Subject: [PATCH 3/4] add dstring gtests --- cpp/tests/strings/dstring_tests.cu | 444 +++++++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 cpp/tests/strings/dstring_tests.cu diff --git a/cpp/tests/strings/dstring_tests.cu b/cpp/tests/strings/dstring_tests.cu new file mode 100644 index 00000000000..72f82f14d72 --- /dev/null +++ b/cpp/tests/strings/dstring_tests.cu @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +struct DStringTest : public cudf::test::BaseFixture { +}; + +namespace { + +template +void run_dstring_test(Functor fn, cudf::column_view const expected) +{ + auto const rows = cudf::column_view(expected).size(); + auto output = cudf::strings::udf::create_dstring_array(rows); + auto output_data = static_cast(output->data()); + auto stream = rmm::cuda_stream_default; + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(rows), + output_data, + fn); + + auto d_span = cudf::device_span(output_data, rows); + auto results = cudf::strings::udf::make_strings_column(d_span); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + cudf::strings::udf::free_dstring_array( + cudf::device_span(output_data, rows)); +} + +struct ctor_and_assign_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + switch (idx) { + case 0: { // dstring() + dstring result; + return result; + } + case 1: { // dstring(char*) + dstring result("hello"); + return result; + } + case 2: { // dstring(char*,int) + dstring result("goodbye", 4); + return result; + } + case 3: { // dstring(string_view) + cudf::string_view sv("world", 5); + dstring result(sv); + return result; + } + case 4: { // dstring(int,char) + dstring result(5, '#'); + return result; + } + case 5: { // dstring(dstring&) + dstring input("copy"); + dstring result(input); + return result; + } + case 6: { // dstring(dstring&&) + dstring input("move"); + dstring result(std::move(input)); + return result; + } + case 7: { // operator=(char*) + dstring result; + result = "hello"; + return result; + } + case 8: { // operator=(string_view) + cudf::string_view sv("world", 5); + dstring result; + result = sv; + return result; + } + case 9: { // operator=(dstring&) + dstring result, input("copied"); + result =
input; + return result; + } + case 10: { // operator=(dstring&&) + dstring result; + result = dstring("moved"); + return result; + } + case 11: { // operator=(char*) + dstring result; + result = "hello"; + return result; + } + case 12: { // assign(char*) + dstring result; + return result.assign("accénted"); + } + case 13: { // assign(char*,int) + dstring result; + return result.assign("goodbye", 4); + } + case 14: { // assign(string_view) + cudf::string_view sv("world", 5); + dstring result; + return result.assign(sv); + } + case 15: { // assign(dstring&&) + dstring result; + return result.assign(dstring("movié")); + } + } + } +}; + +} // namespace + +TEST_F(DStringTest, Constructors) +{ + cudf::test::strings_column_wrapper expected({"", + "hello", + "good", + "world", + "#####", + "copy", + "move", + "hello", + "world", + "copied", + "moved", + "hello", + "accénted", + "good", + "world", + "movié"}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); // one validity flag per expected row; only row 0 is null + run_dstring_test(ctor_and_assign_fn{}, expected); +} + +namespace { + +struct append_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + dstring result(":::"); + cudf::string_view sv("world", 5); + switch (idx) { + case 0: return result.append("hello"); // append(char*) + case 1: return result.append("goodbye", 4); // append(char*,int) + case 2: return result.append(sv); // append(string_view) + case 3: return result.append('$', 6); // append(char,int) + case 4: return result += "accénted"; // operator+=(char*) + case 5: return result += sv; // operator+=(string_view) + case 6: return result += '?'; // operator+=(char) + } + } +}; + +} // namespace + +TEST_F(DStringTest, Append) +{ + cudf::test::strings_column_wrapper expected( + {":::hello", ":::good", ":::world", ":::$$$$$$", ":::accénted", ":::world", ":::?"}); + run_dstring_test(append_fn{}, expected); +} + +namespace { + +struct insert_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring
operator()(int idx) + { + dstring result(":::::"); + cudf::string_view sv("insert", 6); + switch (idx) { + case 0: return result.insert(1, "héllo"); // insert(char*) + case 1: return result.insert(2, "good day", 4); // insert(char*,int) + case 2: return result.insert(3, sv); // insert(string_view) + case 3: return result.insert(4, 2, '_'); // insert(int,char) + case 4: return result.insert(0, "héllo"); + case 5: return result.insert(result.length(), "héllo"); + case 6: return result.insert(-1, "héllo"); + case 7: return result.insert(result.length(), 1, 'X'); + } + } +}; + +} // namespace + +TEST_F(DStringTest, Insert) +{ + cudf::test::strings_column_wrapper expected({":héllo::::", + "::good:::", + ":::insert::", + "::::__:", + "héllo:::::", + ":::::héllo", + ":::::", + ":::::X"}); + + run_dstring_test(insert_fn{}, expected); +} + +namespace { + +struct replace_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + dstring result("0123456789"); + cudf::string_view sv("replace", 7); + switch (idx) { + case 0: return result.replace(5, 1, " "); // replace(char*): same size + case 1: return result.replace(3, 2, "XYZ", 2); // replace(char*,int): same size + case 2: return result.replace(2, 7, sv); // replace(string_view): same size + case 3: return result.replace(1, 2, 2, '_'); // replace(int,char): same size + case 4: return result.replace(2, 5, "XY"); // replace(char*): smaller + case 5: return result.replace(0, 6, "XYZ", 2); // replace(char*,int): smaller + case 6: return result.replace(1, 8, sv); // replace(string_view): smaller + case 7: return result.replace(3, 4, 2, '_'); // replace(int,char): smaller + case 8: return result.replace(2, 2, "WXYZ"); // replace(char*): larger + case 9: return result.replace(0, 2, "WXYZ", 3); // replace(char*,int): larger + case 10: return result.replace(1, 4, sv); // replace(string_view): larger + case 11: return result.replace(3, 4, 6, '_'); // replace(int,char): larger + case 12: return
result.replace(5, -1, ""); // replace to the end of a string + case 13: return result.replace(5, 0, " "); // replace/insert + case 14: return result.replace(result.length(), -1, "X"); // replace/append + case 15: return result.replace(-1, -1, "X"); // no change + case 16: return result.replace(0, -1, "X", -1); // no change + case 17: return result.replace(3, 0, ""); // no change + } + } +}; + +} // namespace + +TEST_F(DStringTest, Replace) +{ + cudf::test::strings_column_wrapper expected({ + "01234 6789", // 0 + "012XY56789", // 1 + "01replace9", // 2 + "0__3456789", // 3 + "01XY789", // 4 + "XY6789", // 5 + "0replace9", // 6 + "012__789", // 7 + "01WXYZ456789", // 8 + "WXY23456789", // 9 + "0replace56789", // 10 + "012______789", // 11 + "01234", // 12 + "01234 56789", // 13 + "0123456789X", // 14 + "0123456789", // 15 + "0123456789", // 16 + "0123456789" // 17 + }); + + run_dstring_test(replace_fn{}, expected); +} + +namespace { + +struct erase_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + dstring result("0123456789"); + switch (idx) { + case 0: return result.erase(5, 5); + case 1: return result.erase(5); + case 2: return result.erase(0, 5); + case 3: return result.erase(0); + case 4: return result.erase(-1); + case 5: return result.erase(result.length()); + } + } +}; + +} // namespace + +TEST_F(DStringTest, Erase) +{ + cudf::test::strings_column_wrapper expected( + {"01234", "01234", "56789", "", "0123456789", "0123456789"}); + run_dstring_test(erase_fn{}, expected); +} + +namespace { + +struct substring_fn { + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + dstring result("0123456789"); + switch (idx) { + case 0: return result.substr(0, 5); + case 1: return result.substr(5); + case 2: return result.substr(5, result.length()); + case 3: return result.substr(0); + case 4: return result.substr(-1); + case 5: return result.substr(0, result.length()); + case 6: return
result.substr(3, 0); + case 7: return result.substr(result.length(), 0); + } + } +}; + +} // namespace + +TEST_F(DStringTest, Substring) +{ + cudf::test::strings_column_wrapper expected( + {"01234", "56789", "56789", "0123456789", "", "0123456789", "", ""}); + run_dstring_test(substring_fn{}, expected); +} + +namespace { + +struct resize_fn { + using dstring = cudf::strings::udf::dstring; + __device__ cudf::size_type operator()(int idx) + { + dstring result("0123456789"); + switch (idx) { + case 0: result.reserve(5); break; + case 1: result.reserve(25); break; + case 2: result.resize(12); break; + case 3: result.resize(4); break; + case 4: result.clear(); break; + case 5: + result.reserve(25); + result.shrink_to_fit(); + break; + case 6: + result.resize(12); + result.shrink_to_fit(); + break; + } + return result.size_bytes(); + } +}; + +} // namespace + +TEST_F(DStringTest, Resize) +{ + cudf::test::fixed_width_column_wrapper expected({10, 10, 12, 4, 0, 10, 12}); + + auto rows = cudf::column_view(expected).size(); + auto stream = rmm::cuda_stream_default; + auto d_result = rmm::device_uvector(rows, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(rows), + d_result.data(), + resize_fn{}); + + auto d_span = cudf::device_span(d_result.data(), rows); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(cudf::column_view(d_span), expected); +} + +namespace { + +struct compare_fn { + using dstring = cudf::strings::udf::dstring; + __device__ int operator()(cudf::string_view const sv) + { + dstring result("abcdef"); + auto const rtn = result.compare(sv); + // convert to simply 0, 1, or -1 + return rtn == 0 ?
0 : (rtn / std::abs(rtn)); + } +}; + +} // namespace + +TEST_F(DStringTest, Compare) +{ + cudf::test::strings_column_wrapper input({"abcdef", "abcdefg", "abcdéf", "012345", "abc", ""}); + auto rows = cudf::column_view(input).size(); + auto stream = rmm::cuda_stream_default; + + auto view_array = cudf::strings::udf::create_string_view_array(cudf::strings_column_view(input)); + auto d_result = rmm::device_uvector(rows, stream); + + thrust::transform( + rmm::exec_policy(stream), view_array.begin(), view_array.end(), d_result.data(), compare_fn{}); + + auto expected = cudf::test::fixed_width_column_wrapper({0, -1, -1, 1, 1, 1}); + auto d_span = cudf::device_span(d_result.data(), rows); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(cudf::column_view(d_span), expected); +} + +namespace { + +struct copy_fn { + cudf::string_view const* d_input; + + using dstring = cudf::strings::udf::dstring; + __device__ dstring operator()(int idx) + { + dstring result{d_input[idx]}; + return result; + } +}; + +} // namespace + +TEST_F(DStringTest, ColumnFunctions) +{ + cudf::test::strings_column_wrapper input({"abcdef", "abcdefg", "abcdéf", "012345", "abc", ""}); + auto view_array = cudf::strings::udf::create_string_view_array(cudf::strings_column_view(input)); + + run_dstring_test(copy_fn{view_array.data()}, input); +} From cb7d581699593045bd54f33f0e6911898ce4e14f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Apr 2022 16:28:06 -0400 Subject: [PATCH 4/4] Add initial support for string udfs in libcudf --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/string.cuh | 66 ++++++++++++++++++--- cpp/include/cudf/strings/string_view.cuh | 14 +---- cpp/include/doxygen_groups.h | 1 + cpp/src/strings/convert/convert_floats.cu | 6 +- cpp/src/strings/convert/convert_integers.cu | 13 ++-- cpp/tests/CMakeLists.txt | 1 + 7 files changed, 70 insertions(+), 32 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9422edaa8f..9d1e2a1a6f7 100644 --- a/cpp/CMakeLists.txt +++
b/cpp/CMakeLists.txt @@ -479,6 +479,7 @@ add_library( src/strings/strip.cu src/strings/substring.cu src/strings/translate.cu + src/strings/udf/column_functions.cu src/strings/utilities.cu src/strings/wrap.cu src/structs/copying/concatenate.cu diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index 0cfcaeb913e..a6df0fc13bd 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -17,13 +17,21 @@ #include -#include -#include -#include - namespace cudf { namespace strings { -namespace string { +namespace detail { + +__device__ inline static cudf::size_type bytes_in_null_terminated_string(char const* str) // byte count up to (not including) the first '\0'; 0 for nullptr +{ + if (!str) return 0; + cudf::size_type bytes = 0; + while (*str++) + ++bytes; + return bytes; +} + +} // namespace detail + /** * @addtogroup strings_classes * @{ @@ -50,9 +58,12 @@ inline __device__ bool is_integer(string_view const& d_str) auto begin = d_str.begin(); auto end = d_str.end(); if (*begin == '+' || *begin == '-') ++begin; - return (thrust::distance(begin, end) > 0) && - thrust::all_of( - thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; }); + auto const result = begin < end; + while (begin < end) { + if (*begin < '0' || *begin > '9') { return false; } + ++begin; + } + return result; } /** @@ -149,7 +160,44 @@ inline __device__ bool is_float(string_view const& d_str) return result; } +__device__ inline bool starts_with(cudf::string_view const dstr, + char const* tgt, + cudf::size_type bytes) +{ + if (bytes > dstr.size_bytes()) { return false; } + auto const start_str = cudf::string_view{dstr.data(), bytes}; + return start_str.compare(tgt, bytes) == 0; +} + +__device__ inline bool starts_with(cudf::string_view const dstr, char const* tgt) +{ + return starts_with(dstr, tgt, detail::bytes_in_null_terminated_string(tgt)); +} + +__device__ inline bool starts_with(cudf::string_view const dstr, cudf::string_view const& tgt) +{ + return
starts_with(dstr, tgt.data(), tgt.size_bytes()); +} + +__device__ inline bool ends_with(cudf::string_view const dstr, + char const* tgt, + cudf::size_type bytes) +{ + if (bytes > dstr.size_bytes()) { return false; } + auto const end_str = cudf::string_view{dstr.data() + dstr.size_bytes() - bytes, bytes}; // view of the last `bytes` bytes of dstr + return end_str.compare(tgt, bytes) == 0; +} + +__device__ inline bool ends_with(cudf::string_view const dstr, char const* tgt) +{ + return ends_with(dstr, tgt, detail::bytes_in_null_terminated_string(tgt)); +} + +__device__ inline bool ends_with(cudf::string_view const dstr, cudf::string_view const& tgt) +{ + return ends_with(dstr, tgt.data(), tgt.size_bytes()); // suffix check: delegate to the ends_with byte overload, not starts_with +} + /** @} */ // end of group -} // namespace string } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index a486a5a765c..a9cd45420b3 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -23,13 +23,6 @@ #include #endif -// This is defined when including this header in a https://github.com/NVIDIA/jitify -// or jitify2 source file. The jitify cannot include thrust headers at this time. -#ifndef CUDF_JIT_UDF -#include -#include -#endif - // This file should only include device code logic. // Host-only or host/device code should be defined in the string_view.hpp header file.
@@ -47,18 +40,13 @@ namespace detail { __device__ inline size_type characters_in_string(const char* str, size_type bytes) { if ((str == nullptr) || (bytes == 0)) return 0; - auto ptr = reinterpret_cast(str); -#ifndef CUDF_JIT_UDF - return thrust::count_if( - thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); }); -#else + auto ptr = reinterpret_cast(str); size_type chars = 0; auto const end = ptr + bytes; while (ptr < end) { chars += is_begin_utf8_char(*ptr++); } return chars; -#endif } /** diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 0abaebc3b0c..21c802ee9cb 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_udfs UDF Support * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index b8a10a00f5b..8acf348ef05 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -67,8 +67,8 @@ __device__ inline double stod(string_view const& d_str) // special strings: NaN, Inf if ((in_ptr < end) && *in_ptr > '9') { auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); - if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); - if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + if (is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); } // Parse and store the mantissa as much as we can, @@ -567,7 +567,7 @@ std::unique_ptr is_float( d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); + return strings::is_float(d_column.element(idx)); }); 
results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 95ddf1822a7..75c2f851bab 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -150,14 +150,13 @@ std::unique_ptr is_integer( d_column->pair_begin(), d_column->pair_end(), d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + [] __device__(auto const& p) { return p.second ? strings::is_integer(p.first) : false; }); } else { - thrust::transform( - rmm::exec_policy(stream), - d_column->pair_begin(), - d_column->pair_end(), - d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return strings::is_integer(p.first); }); } // Calling mutable_view() on a column invalidates it's null count so we need to set it back diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..a9e2060e98c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -390,6 +390,7 @@ ConfigureTest( strings/concatenate_tests.cpp strings/contains_tests.cpp strings/datetime_tests.cpp + strings/dstring_tests.cu strings/durations_tests.cpp strings/extract_tests.cpp strings/factories_test.cu