diff --git a/CHANGELOG.md b/CHANGELOG.md index d9530722307..db359ee71c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ - PR #2838 CSV Reader: Support ARROW_RANDOM_FILE input - PR #2655 CuPy-based Series and Dataframe .values property - PR #2803 Added `edit_distance_matrix()` function to calculate pairwise edit distance for each string on a given nvstrings object. +- PR #2811 Start of cudf strings column work based on 2207 - PR #2872 Add Java pinned memory pool allocator ## Improvements diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f98cca35ded..3e51fa5a7ea 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -324,7 +324,7 @@ add_library(cudf src/column/legacy/column.cpp src/column/legacy/context.cpp src/table/legacy/table.cpp - src/string/nvcategory_util.cpp + src/strings/nvcategory_util.cpp src/join/joining.cu src/orderby/orderby.cu src/predicates/is_sorted.cu @@ -432,13 +432,10 @@ add_library(cudf src/table/table.cpp src/bitmask/null_mask.cu src/sort/sort.cu + src/strings/strings_column_factories.cu + src/strings/strings_column_view.cu src/column/legacy/interop.cpp) -# Override RPATH for nvstrings -set_target_properties(libNVStrings PROPERTIES BUILD_RPATH "\$ORIGIN") -set_target_properties(libNVCategory PROPERTIES BUILD_RPATH "\$ORIGIN") -set_target_properties(libNVText PROPERTIES BUILD_RPATH "\$ORIGIN") - # Rename installation to proper names for later finding set_target_properties(libNVStrings PROPERTIES OUTPUT_NAME "NVStrings") set_target_properties(libNVCategory PROPERTIES OUTPUT_NAME "NVCategory") @@ -447,6 +444,11 @@ set_target_properties(libNVText PROPERTIES OUTPUT_NAME "NVText") # Override RPATH for cudf set_target_properties(cudf PROPERTIES BUILD_RPATH "\$ORIGIN") +# Override RPATH for nvstrings +set_target_properties(libNVStrings PROPERTIES BUILD_RPATH "\$ORIGIN") +set_target_properties(libNVCategory PROPERTIES BUILD_RPATH "\$ORIGIN") +set_target_properties(libNVText PROPERTIES BUILD_RPATH "\$ORIGIN") + ################################################################################################### # - jitify ---------------------------------------------------------------------------------------- diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 8e83da52458..18b46303c0c 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -18,6 +18,8 @@ #include #include #include +#include +#include namespace cudf { @@ -70,34 +72,6 @@ class alignas(16) column_device_view_base { return head() + _offset; } - /**---------------------------------------------------------------------------* - * @brief Returns reference to element at the specified index. - * - * This function accounts for the offset. - * - * @tparam T The element type - * @param element_index Position of the desired element - *---------------------------------------------------------------------------**/ - template - __device__ T const& element(size_type element_index) const noexcept { - return data()[element_index]; - } - - /**---------------------------------------------------------------------------* - * @brief Returns `string_view` to the string element at the specified index. - * - * This function accounts for the offset. - * - * @param element_index Position of the desired string - *---------------------------------------------------------------------------**/ - /* - template <> - __device__ string_view const& element( - size_type element_index) const noexcept { - // Fill this in - } - */ - /**---------------------------------------------------------------------------* * @brief Returns the number of elements in the column *---------------------------------------------------------------------------**/ @@ -234,6 +208,33 @@ class alignas(16) column_device_view : public detail::column_device_view_base { column_device_view& operator=(column_device_view const&) = default; column_device_view& operator=(column_device_view&&) = default; + /**---------------------------------------------------------------------------* + * @brief Creates an instance of this class using the specified host memory + * pointer (h_ptr) to store child objects and the device memory pointer (d_ptr) + * as a base for any child object pointers. + * + * @param column Column view from which to create this instance. + * @param h_ptr Host memory pointer on which to place any child data. + * @param d_ptr Device memory pointer on which to base any child pointers. + *---------------------------------------------------------------------------**/ + column_device_view( column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); + + /**---------------------------------------------------------------------------* + * @brief Returns reference to element at the specified index. + * + * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, + * then any attempt to use the result will lead to undefined behavior. + * + * This function accounts for the offset. + * + * @tparam T The element type + * @param element_index Position of the desired element + *---------------------------------------------------------------------------**/ + template + __device__ T const element(size_type element_index) const noexcept { + return data()[element_index]; + } + /**---------------------------------------------------------------------------* * @brief Factory to construct a column view that is usable in device memory. * @@ -265,6 +266,14 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *---------------------------------------------------------------------------**/ void destroy(); + /**---------------------------------------------------------------------------* + * @brief Return the size in bytes of the amount of memory needed to hold a + * device view of the specified column and it's children. + * + * @param source_view The `column_view` to use for this calculation. + *---------------------------------------------------------------------------**/ + static size_type extent(column_view source_view); + /**---------------------------------------------------------------------------* * @brief Returns the specified child * @@ -308,6 +317,17 @@ class alignas(16) mutable_column_device_view default; mutable_column_device_view& operator=(mutable_column_device_view&&) = default; + /**---------------------------------------------------------------------------* + * @brief Creates an instance of this class using the specified host memory + * pointer (h_ptr) to store child objects and the device memory pointer (d_ptr) + * as a base for any child object pointers. + * + * @param column Column view from which to create this instance. + * @param h_ptr Host memory pointer on which to place any child data. + * @param d_ptr Device memory pointer on which to base any child pointers. + *---------------------------------------------------------------------------**/ + mutable_column_device_view( mutable_column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); + /**---------------------------------------------------------------------------* * @brief Factory to construct a column view that is usable in device memory. * @@ -327,7 +347,8 @@ class alignas(16) mutable_column_device_view * @return A `unique_ptr` to a `mutable_column_device_view` that makes the *data from `source_view` available in device memory. *---------------------------------------------------------------------------**/ - static auto create(mutable_column_view source_view, cudaStream_t stream = 0); + static std::unique_ptr> + create(mutable_column_view source_view, cudaStream_t stream = 0); /**---------------------------------------------------------------------------* * @brief Returns pointer to the base device memory allocation casted to @@ -376,21 +397,6 @@ class alignas(16) mutable_column_device_view return data()[element_index]; } - /**---------------------------------------------------------------------------* - * @brief Returns `string_view` to the string element at the specified index. - * - * This function accounts for the offset. - * - * @param element_index Position of the desired string - *---------------------------------------------------------------------------**/ - /* - template <> - __device__ string_view& element( - size_type element_index) noexcept { - // Fill this in - } - */ - /**---------------------------------------------------------------------------* * @brief Returns raw pointer to the underlying bitmask allocation. * @@ -454,6 +460,14 @@ class alignas(16) mutable_column_device_view null_mask()[word_index] = new_word; } + /**---------------------------------------------------------------------------* + * @brief Return the size in bytes of the amount of memory needed to hold a + * device view of the specified column and it's children. + * + * @param source_view The `column_view` to use for this calculation. + *---------------------------------------------------------------------------**/ + static size_type extent(mutable_column_view source_view); + private: mutable_column_device_view* mutable_children{}; ///< Array of `mutable_column_device_view` @@ -479,6 +493,28 @@ class alignas(16) mutable_column_device_view * allocated to hold the child views. *---------------------------------------------------------------------------**/ void destroy(); + }; +/**---------------------------------------------------------------------------* + * @brief Returns `string_view` to the string element at the specified index. + * + * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, + * then any attempt to use the result will lead to undefined behavior. + * + * This function accounts for the offset. + * + * @param element_index Position of the desired string element + * @return string_view instance representing this element at this index + *---------------------------------------------------------------------------**/ +template <> +__device__ inline string_view const column_device_view::element( + size_type element_index) const noexcept { + size_type index = element_index + offset(); // account for this view's _offset + const int32_t* d_offsets = d_children[strings_column_view::offsets_column_index].data(); + const char* d_strings = d_children[strings_column_view::chars_column_index].data(); + size_type offset = d_offsets[index]; + return string_view{d_strings + offset, d_offsets[index+1] - offset}; +} + } // namespace cudf diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 9dbd24f67e4..884567eadf5 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -19,6 +19,8 @@ #include #include "column.hpp" +#include + namespace cudf { /**---------------------------------------------------------------------------* * @brief Construct column with sufficient uninitialized storage @@ -43,4 +45,68 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = UNALLOCATED, cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/**---------------------------------------------------------------------------* + * @brief Construct STRING type column given an array of pointer/size pairs. + * The total number of char bytes must not exceed the maximum size of size_type. + * The string characters are expected to be UTF-8 encoded sequence of char bytes. + * Use the strings_column_view class to perform strings operations on this type + * of column. + * + * @note `null_count()` and `null_bitmask` are determined if a pair contains + * a null string. That is, for each pair, if `.first` is null, that string + * is considered null. Likewise, a string is considered empty (not null) + * if `.first` is not null and `.second` is 0. Otherwise the `.first` member + * must be a valid device address pointing to `.second` consecutive bytes. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param strings The pointer/size pair arrays. + * Each pointer must be a device memory address or `nullptr` (indicating a null string). + * The size must be the number of bytes. + * @param stream Optional stream for use with all memory allocation + * and device kernels + * @param mr Optional resource to use for device memory + * allocation of the column's `null_mask` and children. + *---------------------------------------------------------------------------**/ +std::unique_ptr make_strings_column( + const rmm::device_vector>& strings, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/**---------------------------------------------------------------------------* + * @brief Construct STRING type column given an contiguous array of chars + * encoded as UTF-8, an array of byte offsets identifying individual strings + * within the char array, and a null bitmask. + * The total number of char bytes must not exceed the maximum size of size_type. + * Use the strings_column_view class to perform strings operations on this type + * of column. + * This function makes a deep copy of the strings, offsets, null_mask to create + * a new column. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param strings The contiguous array of chars in device memory. + * This char array is expected to be UTF-8 encoded characters. + * @param offsets The array of byte offsets in device memory. + * The number of elements is one more than the total number + * of strings so the offset[last] - offset[0] is the total + * number of bytes in the strings array. + * @param null_mask The array of bits specifying the null strings. + * This array must be in device memory. + * Arrow format for nulls is used for interpeting this bitmask. + * @param null_count The number of null string entries. + * @param stream Optional stream for use with all memory allocation + * and device kernels + * @param mr Optional resource to use for device memory + * allocation of the column's `null_mask` and children. + *---------------------------------------------------------------------------**/ +std::unique_ptr make_strings_column( + const rmm::device_vector& strings, + const rmm::device_vector& offsets, + const rmm::device_vector& null_mask, + size_type null_count, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh new file mode 100644 index 00000000000..eb5113df517 --- /dev/null +++ b/cpp/include/cudf/strings/string_view.cuh @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf +{ + +// UTF-8 characters are 1-4 bytes +using char_utf8 = uint32_t; + +/**---------------------------------------------------------------------------* + * @brief A non-owning, immutable view of device data that is a variable length + * char array representing a UTF-8 string. The caller must maintain the + * device memory for the lifetime of this instance. + * + * It provides a simple wrapper and string operations for an individual string + * with a column of strings. + *---------------------------------------------------------------------------**/ +class string_view +{ + public: + string_view() = default; + /**---------------------------------------------------------------------------* + * @brief Create instance from existing device char array. + * + * @param data Device char array encoded in UTF8. + * @param bytes Number of bytes in data array. + *---------------------------------------------------------------------------**/ + __host__ __device__ string_view(const char* data, size_type bytes); + /**---------------------------------------------------------------------------* + * @brief Create instance from existing device char array. The array must + * include a null-terminator ('\0). + * + * @param data Device char array encoded in UTF8. + *---------------------------------------------------------------------------**/ + __device__ string_view(const char* data); + string_view(const string_view&) = default; + string_view(string_view&&) = default; + ~string_view() = default; + string_view& operator=(const string_view&) = default; + string_view& operator=(string_view&&) = default; + + /**---------------------------------------------------------------------------* + * @brief Return the number of bytes in this string + *---------------------------------------------------------------------------**/ + __host__ __device__ size_type size_bytes() const; + /**---------------------------------------------------------------------------* + * @brief Return the number of characters in this string + *---------------------------------------------------------------------------**/ + __device__ size_type length() const; + /**---------------------------------------------------------------------------* + * @brief Return a pointer to the internal device array + *---------------------------------------------------------------------------**/ + __host__ __device__ const char* data() const; + + /**---------------------------------------------------------------------------* + * @brief Return true if string has no characters + *---------------------------------------------------------------------------**/ + __host__ __device__ bool empty() const; + /**---------------------------------------------------------------------------* + * @brief Return true if string is NULL. + * That is, `data()==nullptr` for this instance. + *---------------------------------------------------------------------------**/ + __host__ __device__ bool is_null() const; + + /**---------------------------------------------------------------------------* + * @brief Handy iterator for navigating through encoded characters. + *---------------------------------------------------------------------------**/ + class const_iterator + { + public: + using difference_type = ptrdiff_t; + using value_type = char_utf8; + using reference = char_utf8&; + using pointer = char_utf8*; + using iterator_category = std::input_iterator_tag; // do not allow going backwards + __device__ const_iterator(const string_view& str, size_type pos); + const_iterator(const const_iterator& mit) = default; + const_iterator(const_iterator&& mit) = default; + const_iterator& operator=(const const_iterator&) = default; + const_iterator& operator=(const_iterator&&) = default; + __device__ const_iterator& operator++(); + __device__ const_iterator operator++(int); + __device__ bool operator==(const const_iterator& rhs) const; + __device__ bool operator!=(const const_iterator& rhs) const; + __device__ char_utf8 operator*() const; + __device__ size_type position() const; + __device__ size_type byte_offset() const; + private: + const char* p{}; + size_type cpos{}, offset{}; + }; + + /**---------------------------------------------------------------------------* + * @brief Return new iterator pointing to the beginning of this string + *---------------------------------------------------------------------------**/ + __device__ const_iterator begin() const; + /**---------------------------------------------------------------------------* + * @brief Return new iterator pointing past the end of this string + *---------------------------------------------------------------------------**/ + __device__ const_iterator end() const; + + /**---------------------------------------------------------------------------* + * @brief Return single UTF-8 character at the given character position + * + * @param pos Character position + *---------------------------------------------------------------------------**/ + __device__ char_utf8 operator[](size_type pos) const; + /**---------------------------------------------------------------------------* + * @brief Return the byte offset from data() for a given character position + * + * @param pos Character position + *---------------------------------------------------------------------------**/ + __device__ size_type byte_offset(size_type pos) const; + + /**---------------------------------------------------------------------------* + * @brief Comparing target string with this string. Each character is compared + * as a UTF-8 code-point value. + * + * @param str Target string to compare with this string. + * @return 0 If they compare equal. + * <0 Either the value of the first character of this string that does + * not match is lower in the arg string, or all compared characters + * match but the arg string is shorter. + * >0 Either the value of the first character of this string that does + * not match is greater in the arg string, or all compared characters + * match but the arg string is longer. + *---------------------------------------------------------------------------**/ + __device__ int compare(const string_view& str) const; + /**---------------------------------------------------------------------------* + * @brief Comparing target string with this string. Each character is compared + * as a UTF-8 code-point value. + * + * @param str Target string to compare with this string. + * @param bytes Number of bytes in str. + * @return 0 If they compare equal. + * <0 Either the value of the first character of this string that does + * not match is lower in the arg string, or all compared characters + * match but the arg string is shorter. + * >0 Either the value of the first character of this string that does + * not match is greater in the arg string, or all compared characters + * match but the arg string is longer. + *---------------------------------------------------------------------------**/ + __device__ int compare(const char* str, size_type bytes) const; + + /**---------------------------------------------------------------------------* + * @brief Returns true if rhs matches this string exactly. + *---------------------------------------------------------------------------**/ + __device__ bool operator==(const string_view& rhs) const; + /**---------------------------------------------------------------------------* + * @brief Returns true if rhs does not match this string. + *---------------------------------------------------------------------------**/ + __device__ bool operator!=(const string_view& rhs) const; + /**---------------------------------------------------------------------------* + * @brief Returns true if this string is ordered before rhs. + *---------------------------------------------------------------------------**/ + __device__ bool operator<(const string_view& rhs) const; + /**---------------------------------------------------------------------------* + * @brief Returns true if rhs is ordered before this string. + *---------------------------------------------------------------------------**/ + __device__ bool operator>(const string_view& rhs) const; + /**---------------------------------------------------------------------------* + * @brief Returns true if this string matches or is ordered before rhs. + *---------------------------------------------------------------------------**/ + __device__ bool operator<=(const string_view& rhs) const; + /**---------------------------------------------------------------------------* + * @brief Returns true if rhs matches or is ordered before this string. + *---------------------------------------------------------------------------**/ + __device__ bool operator>=(const string_view& rhs) const; + + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the first occurrence where the + * argument str is found in this string within the character range [pos,pos+n). + * + * @param str Target string to search within this string. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if str is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type find( const string_view& str, size_type pos=0, size_type count=-1 ) const; + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the first occurrence where the + * array str is found in this string within the character range [pos,pos+n). + * + * @param str Target array to search within this string. + * @param bytes Number of bytes in str. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if arg string is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type find( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the first occurrence where + * character is found in this string within the character range [pos,pos+n). + * + * @param character Single encoded character. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if arg string is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type find( char_utf8 character, size_type pos=0, size_type count=-1 ) const; + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the last occurrence where the + * argument str is found in this string within the character range [pos,pos+n). + * + * @param str Target string to search within this string. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if arg string is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type rfind( const string_view& str, size_type pos=0, size_type count=-1 ) const; + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the last occurrence where the + * array str is found in this string within the character range [pos,pos+n). + * + * @param str Target string to search with this string. + * @param bytes Number of bytes in str. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if arg string is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type rfind( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; + /**---------------------------------------------------------------------------* + * @brief Returns the character position of the last occurrence where + * character is found in this string within the character range [pos,pos+n). + * + * @param character Single encoded character. + * @param pos Character position to start search within this string. + * @param count Number of characters from pos to include in the search. + * Specify -1 to indicate to the end of the string. + * @return -1 if arg string is not found in this string. + *---------------------------------------------------------------------------**/ + __device__ size_type rfind( char_utf8 character, size_type pos=0, size_type count=-1 ) const; + + /**---------------------------------------------------------------------------* + * @brief Return a sub-string of this string. The original string and device + * memory but must still be maintained for the lifetime of the instance. + * + * @param start Character position to start the sub-string. + * @param length Number of characters from start to include in the sub-string. + * @return New instance pointing to a subset of the characters within this instance. + *---------------------------------------------------------------------------**/ + __device__ string_view substr( size_type start, size_type length ) const; + + /**---------------------------------------------------------------------------* + * @brief Tokenizes this string around the given delimiter up to count times. + * + * @param delimiter Character to use for separating tokens. + * @param count Maximum number of tokens to return. + * Specify -1 to indicate all tokens. + * @param[out] Array to hold output tokens. + * Specify nullptr here to return just the token count. + * @return Number of tokens. + *---------------------------------------------------------------------------**/ + __device__ size_type split( const char* delimiter, size_type count, string_view* strs ) const; + + /**---------------------------------------------------------------------------* + * @brief Tokenizes this string around the given delimiter up to count times + * starting at the end of the string. + * + * @param delimiter Character to use for separating tokens. + * @param count Maximum number of tokens to return. + * Specify -1 to indicate all tokens. + * @param[out] Array to hold output tokens. + * Specify nullptr here to return just the token count. + * @return Number of tokens. + *---------------------------------------------------------------------------**/ + __device__ size_type rsplit( const char* delimiter, size_type count, string_view* strs ) const; + +private: + const char* _data{}; ///< Pointer to device memory contain char array for this string + size_type _bytes{}; ///< Number of bytes in _data for this string + + /**---------------------------------------------------------------------------* + * @brief Return the character position of the given byte offset. + * + * @param bytepos Byte position from start of _data. + * @return The character position for the specified byte. + *---------------------------------------------------------------------------**/ + __device__ size_type character_offset(size_type bytepos) const; +}; + +namespace strings +{ +namespace detail +{ +/**---------------------------------------------------------------------------* + * @brief Returns the number of bytes in the specified character. + * + * @param chr Single character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type bytes_in_char_utf8( char_utf8 character ); + +/**---------------------------------------------------------------------------* + * @brief Convert a char array into a char_utf8 value. + * + * @param str String containing encoded char bytes. + * @param[out] chr Single char_utf8 value. + * @return The number of bytes in the character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type to_char_utf8( const char* str, char_utf8& character ); + +/**---------------------------------------------------------------------------* + * @brief Place a char_utf8 value into a char array. + * + * @param chr Single character + * @param[out] str Allocated char array with enough space to hold the encoded characer. + * @return The number of bytes in the character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type from_char_utf8( char_utf8 character, char* str ); + +/**---------------------------------------------------------------------------* + * @brief Return the number of UTF-8 characters in this provided char array. + * + * @param str String with encoded char bytes. + * @param bytes Number of bytes in str. + * @return The number of characters in the array. + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type characters_in_string( const char* str, size_type bytes ); + +} // namespace detail +} // namespace strings +} // namespace cudf + +#include "./string_view.inl" diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl new file mode 100644 index 00000000000..afbd9d962ae --- /dev/null +++ b/cpp/include/cudf/strings/string_view.inl @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +namespace +{ +using BYTE = uint8_t; + +/**---------------------------------------------------------------------------* + * @brief Returns the number of bytes used to represent the provided byte. + * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a + * single character. For example, for the two-byte 0xC3A8 single character, + * the first byte would return 2 and the second byte would return 0. + * + * @param byte Byte from an encoded character. + * @return Number of bytes. + *---------------------------------------------------------------------------**/ +__host__ __device__ inline cudf::size_type bytes_in_utf8_byte(BYTE byte) +{ + cudf::size_type count = 1; + count += (int)((byte & 0xF0) == 0xF0); // 4-byte character prefix + count += (int)((byte & 0xE0) == 0xE0); // 3-byte character prefix + count += (int)((byte & 0xC0) == 0xC0); // 2-byte character prefix + count -= (int)((byte & 0xC0) == 0x80); // intermediate byte + return count; +} + +/**---------------------------------------------------------------------------* + * @brief Returns the number of bytes used in the provided char array by + * searching for a null-terminator ('\0') byte. + * + * @param str Null-terminated array of chars. + * @return Number of bytes. + *---------------------------------------------------------------------------**/ +__device__ inline cudf::size_type string_length( const char* str ) +{ + if( !str ) + return 0; + cudf::size_type bytes = 0; + while(*str++) + ++bytes; + return bytes; +} + +} // namespace + +namespace cudf +{ + +__host__ __device__ inline string_view::string_view(const char* data, size_type bytes) + : _data(data), _bytes(bytes) +{} + +__device__ inline string_view::string_view(const char* data) + : _data{data}, _bytes{string_length(data)} +{} + +// +__host__ __device__ inline size_type string_view::size_bytes() const +{ + return _bytes; +} + +__device__ inline size_type string_view::length() const +{ + return strings::detail::characters_in_string(_data,_bytes); +} + +__host__ __device__ inline const char* string_view::data() const +{ + return _data; +} + +__host__ __device__ inline bool string_view::empty() const +{ + return _bytes == 0; +} + +__host__ __device__ inline bool string_view::is_null() const +{ + return _data == nullptr; +} + +// the custom iterator knows about UTF8 encoding +__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos) + : cpos{pos}, p{str.data()}, offset{str.byte_offset(pos)} +{} + +__device__ inline string_view::const_iterator& string_view::const_iterator::operator++() +{ + offset += bytes_in_utf8_byte((BYTE)p[offset]); + ++cpos; + return *this; +} + +__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int) +{ + const_iterator tmp(*this); + operator++(); + return tmp; +} + +__device__ inline bool string_view::const_iterator::operator==(const string_view::const_iterator& rhs) const +{ + return (p == rhs.p) && (cpos == rhs.cpos); +} + +__device__ inline bool string_view::const_iterator::operator!=(const string_view::const_iterator& rhs) const +{ + return (p != rhs.p) || (cpos != rhs.cpos); +} + +// unsigned int can hold 1-4 bytes for the UTF8 char +__device__ inline char_utf8 string_view::const_iterator::operator*() const +{ + char_utf8 chr = 0; + strings::detail::to_char_utf8(p + offset, chr); + return chr; +} + +__device__ inline size_type string_view::const_iterator::position() const +{ + return cpos; +} + +__device__ inline size_type string_view::const_iterator::byte_offset() const +{ + return offset; +} + +__device__ inline string_view::const_iterator string_view::begin() const +{ + return const_iterator(*this, 0); +} + +__device__ inline string_view::const_iterator string_view::end() const +{ + return const_iterator(*this, length()); +} + +__device__ inline char_utf8 string_view::operator[](size_type pos) const +{ + unsigned int offset = byte_offset(pos); + if(offset >= _bytes) + return 0; + char_utf8 chr = 0; + strings::detail::to_char_utf8(data() + offset, chr); + return chr; +} + +__device__ inline size_type string_view::byte_offset(size_type pos) const +{ + size_type offset = 0; + const char* sptr = _data; + const char* eptr = sptr + _bytes; + while( (pos > 0) && (sptr < eptr) ) + { + size_type charbytes = bytes_in_utf8_byte((BYTE)*sptr++); + if( charbytes ) + --pos; + offset += charbytes; + } + return offset; +} + +__device__ inline int string_view::compare(const string_view& in) const +{ + return compare(in.data(), in.size_bytes()); +} + +__device__ inline int string_view::compare(const char* data, size_type bytes) const +{ + const unsigned char* ptr1 = reinterpret_cast(this->data()); + if(!ptr1) + return -1; + const unsigned char* ptr2 = reinterpret_cast(data); + if(!ptr2) + return 1; + size_type len1 = size_bytes(); + size_type idx = 0; + for(; (idx < len1) && (idx < bytes); ++idx) + { + if(*ptr1 != *ptr2) + return (int)*ptr1 - (int)*ptr2; + ++ptr1; + ++ptr2; + } + if(idx < len1) + return 1; + if(idx < bytes) + return -1; + return 0; +} + +__device__ inline bool string_view::operator==(const string_view& rhs) const +{ + return compare(rhs) == 0; +} + +__device__ inline bool string_view::operator!=(const string_view& rhs) const +{ + return compare(rhs) != 0; +} + +__device__ inline bool string_view::operator<(const string_view& rhs) const +{ + return compare(rhs) < 0; +} + +__device__ inline bool string_view::operator>(const string_view& rhs) const +{ + return compare(rhs) > 0; +} + +__device__ inline bool string_view::operator<=(const string_view& rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc < 0); +} + +__device__ inline bool string_view::operator>=(const string_view& rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc > 0); +} + +__device__ inline size_type string_view::find(const string_view& str, size_type pos, int count) const +{ + return find(str.data(), str.size_bytes(), pos, count); +} + +__device__ inline size_type string_view::find(const char* str, size_type bytes, size_type pos, int count) const +{ + const char* sptr = data(); + if(!str || !bytes) + return -1; + size_type nchars = length(); + if(count < 0) + count = nchars; + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + size_type spos = byte_offset(pos); + size_type epos = byte_offset(end); + + size_type len2 = bytes; + size_type len1 = (epos - spos) - len2 + 1; + + const char* ptr1 = sptr + spos; + const char* ptr2 = str; + for(size_type idx=0; idx < len1; ++idx) + { + bool match = true; + for( size_type jdx=0; match && (jdx < len2); ++jdx ) + match = (ptr1[jdx] == ptr2[jdx]); + if( match ) + return character_offset(idx+spos); + ptr1++; + } + return -1; +} + +__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const +{ + char str[sizeof(char_utf8)]; + size_type chwidth = strings::detail::from_char_utf8(chr,str); + return find(str,chwidth,pos,count); +} + +__device__ inline size_type string_view::rfind(const string_view& str, size_type pos, int count) const +{ + return rfind(str.data(), str.size_bytes(), pos, count); +} + +__device__ inline size_type string_view::rfind(const char* str, size_type bytes, size_type pos, int count) const +{ + const char* sptr = data(); + if(!str || !bytes) + return -1; + size_type sz = size_bytes(); + size_type nchars = length(); + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + size_type spos = byte_offset(pos); + size_type epos = byte_offset(end); + + size_type len2 = bytes; + size_type len1 = (epos - spos) - len2 + 1; + + const char* ptr1 = sptr + epos - len2; + const char* ptr2 = str; + for(int idx=0; idx < len1; ++idx) + { + bool match = true; + for(size_type jdx=0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); + if(match) + return character_offset(epos - len2 - idx); + ptr1--; // go backwards + } + return -1; +} + +__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const +{ + char str[sizeof(char_utf8)]; + size_type chwidth = strings::detail::from_char_utf8(chr,str); + return rfind(str,chwidth,pos,count); +} + +// parameters are character position values +__device__ inline string_view string_view::substr(size_type pos, size_type length) const +{ + size_type spos = byte_offset(pos); + size_type epos = byte_offset(pos + length); + if( epos > size_bytes() ) + epos = size_bytes(); + if(spos >= epos) + return string_view("",0); + length = epos - spos; // converts length to bytes + return string_view(data()+spos,length); +} + +__device__ inline size_type string_view::split(const char* delim, int count, string_view* strs) const +{ + const char* sptr = data(); + size_type sz = size_bytes(); + if(sz == 0) + { + if(strs && count) + strs[0] = *this; + return 1; + } + + size_type bytes = string_length(delim); + size_type delimCount = 0; + size_type pos = find(delim, bytes); + while(pos >= 0) + { + ++delimCount; + pos = find(delim, bytes, pos + bytes); + } + + size_type strsCount = delimCount + 1; + size_type rtn = strsCount; + if((count > 0) && (rtn > count)) + rtn = count; + if(!strs) + return rtn; + // + if(strsCount < count) + count = strsCount; + // + size_type dchars = (bytes ? strings::detail::characters_in_string(delim,bytes) : 1); + size_type nchars = length(); + size_type spos = 0, sidx = 0; + size_type epos = find(delim, bytes); + while(epos >= 0) + { + if(sidx >= (count - 1)) // add this to the while clause + break; + strs[sidx++] = substr(spos, epos - spos); + spos = epos + dchars; + epos = find(delim, bytes, spos); + } + if((spos <= nchars) && (sidx < count)) + strs[sidx] = substr(spos, nchars - spos); + // + return rtn; +} + + +__device__ inline size_type string_view::rsplit(const char* delim, int count, string_view* strs) const +{ + const char* sptr = data(); + size_type sz = size_bytes(); + if(sz == 0) + { + if(strs && count) + strs[0] = *this; + return 1; + } + + size_type bytes = string_length(delim); + size_type delimCount = 0; + size_type pos = find(delim, bytes); + while(pos >= 0) + { + ++delimCount; + pos = find(delim, bytes, (unsigned int)pos + bytes); + } + + unsigned int strsCount = delimCount + 1; + unsigned int rtn = strsCount; + if((count > 0) && (rtn > count)) + rtn = count; + if(!strs) + return rtn; + // + if(strsCount < count) + count = strsCount; + // + unsigned int dchars = (bytes ? strings::detail::characters_in_string(delim,bytes) : 1); + int epos = (int)length(); // end pos is not inclusive + int sidx = count - 1; // index for strs array + int spos = rfind(delim, bytes); + while(spos >= 0) + { + if(sidx <= 0) + break; + //int spos = pos + (int)bytes; + int len = epos - spos - dchars; + strs[sidx--] = substr((unsigned int)spos+dchars, (unsigned int)len); + epos = spos; + spos = rfind(delim, bytes, 0, (unsigned int)epos); + } + if(epos >= 0) + strs[0] = substr(0, epos); + // + return rtn; +} + +__device__ inline size_type string_view::character_offset(size_type bytepos) const +{ + return strings::detail::characters_in_string(data(), bytepos); +} + +namespace strings +{ +namespace detail +{ +__host__ __device__ inline size_type bytes_in_char_utf8(char_utf8 chr) +{ + size_type count = 1; + count += (int)((chr & (unsigned)0x0000FF00) > 0); + count += (int)((chr & (unsigned)0x00FF0000) > 0); + count += (int)((chr & (unsigned)0xFF000000) > 0); + return count; +} + +__host__ __device__ inline size_type to_char_utf8(const char* pSrc, char_utf8 &chr) +{ + size_type chwidth = bytes_in_utf8_byte((BYTE)*pSrc); + chr = (char_utf8)(*pSrc++) & 0xFF; + if(chwidth > 1) + { + chr = chr << 8; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 8; + if(chwidth > 2) + { + chr = chr << 8; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 16; + if(chwidth > 3) + { + chr = chr << 8; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 24; + } + } + } + return chwidth; +} + +__host__ __device__ inline size_type from_char_utf8(char_utf8 chr, char* dst) +{ + size_type chwidth = bytes_in_char_utf8(chr); + for(size_type idx = 0; idx < chwidth; ++idx) + { + dst[chwidth - idx - 1] = (char)chr & 0xFF; + chr = chr >> 8; + } + return chwidth; +} + +// counts the number of characters in the given char array +__host__ __device__ inline size_type characters_in_string(const char* str, size_type bytes) +{ + if( (str==0) || (bytes==0) ) + return 0; + // + unsigned int nchars = 0; + for(size_type idx = 0; idx < bytes; ++idx) + nchars += (unsigned int)(((BYTE)str[idx] & 0xC0) != 0x80); + return (size_type)nchars; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp new file mode 100644 index 00000000000..1aa55e71c0b --- /dev/null +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { + +/**---------------------------------------------------------------------------* + * @brief Given a column-view of strings type, an instance of this class + * provides a wrapper on this compound column for strings operations. + *---------------------------------------------------------------------------**/ +class strings_column_view : private column_view +{ +public: + strings_column_view( column_view strings_column ); + strings_column_view( strings_column_view&& strings_view ) = default; + strings_column_view( const strings_column_view& strings_view ) = default; + ~strings_column_view() = default; + strings_column_view& operator=(strings_column_view const&) = default; + strings_column_view& operator=(strings_column_view&&) = default; + + static constexpr size_type offsets_column_index{0}; + static constexpr size_type chars_column_index{1}; + + using column_view::size; + using column_view::null_mask; + using column_view::null_count; + + /**---------------------------------------------------------------------------* + * @brief Returns the parent column. + *---------------------------------------------------------------------------**/ + column_view parent() const; + + /**---------------------------------------------------------------------------* + * @brief Returns the internal column of offsets + *---------------------------------------------------------------------------**/ + column_view offsets() const; + + /**---------------------------------------------------------------------------* + * @brief Returns the internal column of chars + *---------------------------------------------------------------------------**/ + column_view chars() const; + +}; + +namespace strings +{ + +/**---------------------------------------------------------------------------* + * @brief Prints the strings to stdout. + * + * @param strings Strings instance for this operation. + * @param start Index of first string to print. + * @param end Index of last string to print. Specify -1 for all strings. + * @param max_width Maximum number of characters to print per string. + * Specify -1 to print all characters. + * @param delimiter The chars to print between each string. + * Default is new-line character. + *---------------------------------------------------------------------------**/ +void print( strings_column_view strings, + size_type start=0, size_type end=-1, + size_type max_width=-1, const char* delimiter = "\n" ); + +/**---------------------------------------------------------------------------* + * @brief Create output per Arrow strings format. + * The return pair is the array of chars and the array of offsets. + * + * @param strings Strings instance for this operation. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return Pair containing a contiguous array of chars and an array of offsets. + *---------------------------------------------------------------------------**/ +std::pair, rmm::device_vector> + create_offsets( strings_column_view strings, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 8131825d2ae..e25cd8a0282 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -53,6 +53,7 @@ struct table; class column; class column_view; class mutable_column_view; +class string_view; namespace exp { class table; diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 741a022ac4d..a8751d8471a 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -69,6 +70,13 @@ constexpr inline bool is_numeric() { return std::is_integral::value or std::is_floating_point::value; } +struct is_numeric_impl { + template + bool operator()() { + return is_numeric(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether `type` is a numeric `data_type`. * @@ -81,8 +89,7 @@ constexpr inline bool is_numeric() { * @return false `type` is not numeric *---------------------------------------------------------------------------**/ constexpr inline bool is_numeric(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_numeric(); }, 0); + return cudf::exp::type_dispatcher(type, is_numeric_impl{}); } /**---------------------------------------------------------------------------* @@ -101,6 +108,13 @@ constexpr inline bool is_fixed_width() { return cudf::is_numeric(); } +struct is_fixed_width_impl { + template + bool operator()() { + return is_fixed_width(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are fixed-width. * @@ -111,12 +125,11 @@ constexpr inline bool is_fixed_width() { * @return false `type` is variable-width *---------------------------------------------------------------------------**/ constexpr inline bool is_fixed_width(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_fixed_width(); }, 0); + return cudf::exp::type_dispatcher(type, is_fixed_width_impl{}); } /**---------------------------------------------------------------------------* - * @brief Indictates whether the type `T` is a compound type. + * @brief Indicates whether the type `T` is a compound type. * * `column`s with "compound" elements are logically a single column of elements, * but may be concretely implemented with two or more `column`s. For example, a @@ -129,24 +142,15 @@ constexpr inline bool is_fixed_width(data_type type) { *---------------------------------------------------------------------------**/ template constexpr inline bool is_compound() { - // TODO Implement with checks for the compound wrapper types - return false; + return std::is_same::value; } -/**---------------------------------------------------------------------------* - * @brief Indicates whether the type `T` is a simple type. - * - * "Simple" element types are implemented with only a single column, i.e., - * `num_children() == 0` for columns of "simple" elements - * - * @tparam T The type to verify - * @return true `T` corresponds to a simple type - * @return false `T` corresponds to a compound type - *---------------------------------------------------------------------------**/ -template -constexpr inline bool is_simple() { - return not is_compound(); -} +struct is_compound_impl { + template + bool operator()() { + return is_compound(); + } +}; /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are compound. @@ -161,10 +165,31 @@ constexpr inline bool is_simple() { * @return false `type` is a simple type *---------------------------------------------------------------------------**/ constexpr inline bool is_compound(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_compound(); }, 0); + return cudf::exp::type_dispatcher(type, is_compound_impl{}); } +/**---------------------------------------------------------------------------* + * @brief Indicates whether the type `T` is a simple type. + * + * "Simple" element types are implemented with only a single column, i.e., + * `num_children() == 0` for columns of "simple" elements + * + * @tparam T The type to verify + * @return true `T` corresponds to a simple type + * @return false `T` corresponds to a compound type + *---------------------------------------------------------------------------**/ +template +constexpr inline bool is_simple() { + return not is_compound(); +} + +struct is_simple_impl { + template + bool operator()() { + return is_simple(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are simple. * diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index cc582b2df0c..23b43584386 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -92,6 +92,7 @@ CUDF_TYPE_MAPPING(int32_t, type_id::INT32); CUDF_TYPE_MAPPING(int64_t, type_id::INT64); CUDF_TYPE_MAPPING(float, type_id::FLOAT32); CUDF_TYPE_MAPPING(double, type_id::FLOAT64); +CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING); /**---------------------------------------------------------------------------* * @brief Invokes an `operator()` template with the type instantiation based on @@ -211,6 +212,9 @@ CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher( case FLOAT64: return f.template operator()::type>( std::forward(args)...); + case STRING: + return f.template operator()::type>( + std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported type_id."); diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 329c03980a9..cef67f03b94 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -32,26 +32,120 @@ column_device_view::column_device_view(column_view source) // Free device memory allocated for children void column_device_view::destroy() { + // TODO Needs to handle grand-children + RMM_FREE(d_children,0); delete this; - // TODO Implement once support for children is added +} + +// Place any child objects in host memory (h_ptr) and use the device +// memory ptr (d_ptr) to set any child object pointers. +column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()}, + _num_children{source.num_children()} +{ + if( count_descendants(source) > _num_children ) { + CUDF_FAIL("Columns with grand-children are not currently supported."); + } + if( _num_children > 0 ) + { + column_device_view* h_column = reinterpret_cast(h_ptr); + column_device_view* d_column = reinterpret_cast(d_ptr); + d_children = d_column; + for( size_type idx=0; idx < _num_children; ++idx ) + { // inplace-new each child + column_view child = source.child(idx); + new(h_column) column_device_view(child); + h_column++; + } + } } // Construct a unique_ptr that invokes `destroy()` as it's deleter -std::unique_ptr> -column_device_view::create(column_view source_view, cudaStream_t stream) { +std::unique_ptr> column_device_view::create(column_view source, cudaStream_t stream) { + size_type num_children = source.num_children(); + if( count_descendants(source) > num_children ) { + CUDF_FAIL("Columns with grand-children are not currently supported."); + } + auto deleter = [](column_device_view* v) { v->destroy(); }; + std::unique_ptr p{ + new column_device_view(source), deleter}; - size_type num_descendants{count_descendants(source_view)}; - if (num_descendants > 0) { - CUDF_FAIL("Columns with children are not currently supported."); + if( num_children > 0 ) + { + // create device memory for the children + RMM_TRY(RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_children, stream)); + // build the children into CPU memory first + std::vector buffer(sizeof(column_device_view)*num_children); + auto h_ptr = buffer.data(); + for( size_type idx=0; idx < num_children; ++idx ) + { + // create device-view from view + column_device_view child(source.child(idx)); + // copy child into buffer + memcpy(h_ptr, &child, sizeof(column_device_view)); + // point to the next array slot + h_ptr += sizeof(column_device_view); + } + // copy the CPU memory with the children into device memory + CUDA_TRY(cudaMemcpyAsync(p->d_children, buffer.data(), num_children*sizeof(column_device_view), + cudaMemcpyHostToDevice, stream)); + p->_num_children = num_children; + cudaStreamSynchronize(stream); } + return p; +} - auto deleter = [](column_device_view* v) { v->destroy(); }; +size_type column_device_view::extent(column_view source) { + size_type data_size = sizeof(column_device_view); + for( size_type idx=0; idx < source.num_children(); ++idx ) + data_size += extent(source.child(idx)); + return data_size; +} - std::unique_ptr p{ - new column_device_view(source_view), deleter}; +// For use with inplace-new to pre-fill memory to be copied to device +mutable_column_device_view::mutable_column_device_view( mutable_column_view source ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()} +{ + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); +} + +mutable_column_device_view::mutable_column_device_view( mutable_column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()} +{ + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); +} +// Handle freeing children +void mutable_column_device_view::destroy() { + RMM_FREE(mutable_children,0); + delete this; +} + +// Construct a unique_ptr that invokes `destroy()` as it's deleter +std::unique_ptr> + mutable_column_device_view::create(mutable_column_view source, cudaStream_t stream) { + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); + auto deleter = [](mutable_column_device_view* v) { v->destroy(); }; + std::unique_ptr p{ + new mutable_column_device_view(source), deleter}; return p; } -} // namespace cudf \ No newline at end of file +size_type mutable_column_device_view::extent(mutable_column_view source) { + size_type data_size = sizeof(mutable_column_device_view); + for( size_type idx=0; idx < source.num_children(); ++idx ) + data_size += extent(source.child(idx)); + return data_size; +} + + +} // namespace cudf diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index a3f5ca1d377..1d042d761ba 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -23,7 +23,13 @@ namespace cudf { namespace { struct size_of_helper { template - constexpr int operator()() const noexcept { + constexpr std::enable_if_t(), int> operator()() const { + CUDF_FAIL("Invalid, non fixed-width element type."); + } + + template + constexpr std::enable_if_t(), int> operator()() const + noexcept { return sizeof(T); } }; diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 562bcd11d6c..4b84b1f5d18 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -42,8 +43,11 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); - } else if (size > 0) { - CUDF_EXPECTS(nullptr != data, "Null data pointer."); + } + else if ( is_compound(type) ) { + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } else if( size > 0){ + CUDF_EXPECTS(nullptr != data, "Null data pointer."); } CUDF_EXPECTS(offset >= 0, "Invalid offset."); diff --git a/cpp/src/string/nvcategory_util.cpp b/cpp/src/strings/nvcategory_util.cpp similarity index 100% rename from cpp/src/string/nvcategory_util.cpp rename to cpp/src/strings/nvcategory_util.cpp diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu new file mode 100644 index 00000000000..6ebbf716464 --- /dev/null +++ b/cpp/src/strings/strings_column_factories.cu @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +namespace cudf { + +// Create a strings-type column from array of pointer/size pairs +std::unique_ptr make_strings_column( + const rmm::device_vector>& strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr) +{ + size_type num_strings = strings.size(); + // maybe a separate factory for creating null strings-column + CUDF_EXPECTS(num_strings > 0, "must specify at least one pair"); + + auto execpol = rmm::exec_policy(stream); + auto d_strings = strings.data().get(); + + // check total size is not too large for cudf column + size_t bytes = thrust::transform_reduce( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_strings), + [d_strings] __device__ (size_t idx) { + auto item = d_strings[idx]; + return (item.first!=nullptr) ? item.second : 0; + }, + 0, thrust::plus()); + CUDF_EXPECTS( bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column" ); + + // build offsets column -- last entry is the total size + auto offsets_column = make_numeric_column( data_type{INT32}, num_strings+1, mask_state::UNALLOCATED, stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.data(); + // Using inclusive-scan to compute last entry which is the total size. + // Exclusive-scan is possible but will not compute that last entry. + // Rather than manually computing the final offset using values in device memory, + // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first + // zero offset manually. + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), + d_offsets+1, // fills in offsets entries [1,num_strings] + [d_strings] __device__ (size_type idx) { + thrust::pair item = d_strings[idx]; + return ( item.first!=nullptr ? static_cast(item.second) : 0 ); + }, + thrust::plus() ); + // set the first offset to 0 + CUDA_TRY(cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream)); + + // create null mask + auto valid_mask = valid_if( static_cast(nullptr), + [d_strings] __device__ (size_type idx) { return d_strings[idx].first!=nullptr; }, + num_strings, stream ); + auto null_count = valid_mask.second; + rmm::device_buffer null_mask(valid_mask.first, gdf_valid_allocation_size(num_strings), + stream, mr); + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + // if we have all nulls, a null chars column is allowed + // if all non-null strings are empty strings, we need a non-null chars column + // - in this case we set the bytes to 1 to create a minimal one-byte chars column + if( (bytes==0) && (null_count < num_strings) ) + bytes = 1; // all entries are empty strings + + // build chars column + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), num_strings, + [d_strings, d_offsets, d_chars] __device__(size_type idx){ + // place individual strings + auto item = d_strings[idx]; + if( item.first!=nullptr ) + memcpy(d_chars + d_offsets[idx], item.first, item.second ); + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + // no data-ptr with num_strings elements plus children + return std::make_unique( + data_type{STRING}, num_strings, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); +} + +// Create a strings-type column from array of chars and array of offsets. +std::unique_ptr make_strings_column( + const rmm::device_vector& strings, + const rmm::device_vector& offsets, + const rmm::device_vector& valid_mask, + size_type null_count, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + size_type num_strings = offsets.size()-1; + CUDF_EXPECTS( num_strings > 0, "strings count must be greater than 0"); + CUDF_EXPECTS( null_count < num_strings, "null strings column not yet supported"); + if( null_count > 0 ) { + CUDF_EXPECTS( !valid_mask.empty(), "Cannot have null elements without a null mask." ); + } + + auto execpol = rmm::exec_policy(stream); + size_type bytes = offsets.back() - offsets[0]; + CUDF_EXPECTS( bytes >=0, "invalid offsets vector"); + + // build offsets column -- this is the number of strings + 1 + auto offsets_column = make_numeric_column( data_type{INT32}, num_strings+1, mask_state::UNALLOCATED, stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + CUDA_TRY(cudaMemcpyAsync( offsets_view.data(), offsets.data().get(), + (num_strings+1)*sizeof(int32_t), + cudaMemcpyDeviceToDevice, stream )); + + // build null bitmask + rmm::device_buffer null_mask; + if( null_count ) + null_mask = rmm::device_buffer(valid_mask.data().get(), + gdf_valid_allocation_size(num_strings), + stream, mr); + + // build chars column + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); + auto chars_view = chars_column->mutable_view(); + CUDA_TRY(cudaMemcpyAsync( chars_view.data(), strings.data().get(), bytes, + cudaMemcpyDeviceToDevice, stream )); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + // + return std::make_unique( + data_type{STRING}, num_strings, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); +} + +} // namespace cudf diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu new file mode 100644 index 00000000000..e9148c95dac --- /dev/null +++ b/cpp/src/strings/strings_column_view.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf { + +// +strings_column_view::strings_column_view( column_view strings_column ) + : column_view(strings_column) +{ + CUDF_EXPECTS( type().id()==STRING, "strings_column_view only supports strings"); +} + +column_view strings_column_view::parent() const +{ + return static_cast(*this); +} + +column_view strings_column_view::offsets() const +{ + return child(offsets_column_index); +} + +column_view strings_column_view::chars() const +{ + return child(chars_column_index); +} + + +namespace strings +{ + +// print strings to stdout +void print( strings_column_view strings, + size_type start, size_type end, + size_type max_width, const char* delimiter ) +{ + size_type count = strings.size(); + if( end < 0 || end > count ) + end = count; + if( start < 0 ) + start = 0; + if( start >= end ) + throw std::invalid_argument("invalid parameter value"); + count = end - start; + + // stick with the default stream for this odd/rare stdout function + auto execpol = rmm::exec_policy(0); + auto strings_column = column_device_view::create(strings.parent()); + auto d_column = *strings_column; + auto d_offsets = strings.offsets().data(); + auto d_strings = strings.chars().data(); + + // create output strings offsets + rmm::device_vector output_offsets(count+1,0); + size_t* d_output_offsets = output_offsets.data().get(); + thrust::transform_inclusive_scan( execpol->on(0), + thrust::make_counting_iterator(start), + thrust::make_counting_iterator(end), + d_output_offsets+1, + [d_column, max_width] __device__ (size_type idx) { + if( d_column.nullable() && d_column.is_null(idx) ) + return 0; + string_view d_str = d_column.element(idx); + size_type bytes = d_str.size_bytes(); + if( (max_width > 0) && (d_str.length() > max_width) ) + bytes = d_str.byte_offset(max_width); + return bytes+1; // allow for null-terminator on non-null strings + }, + thrust::plus()); + CUDA_TRY(cudaMemset( d_output_offsets, 0, sizeof(*d_output_offsets))); + + // build output buffer + size_t buffer_size = output_offsets.back(); // last element has total size + if( buffer_size == 0 ) + { + printf("all %d strings are null\n", count); + return; + } + rmm::device_vector buffer(buffer_size,0); // allocate and pre-null-terminate + char* d_buffer = buffer.data().get(); + // copy strings into output buffer + thrust::for_each_n(execpol->on(0), + thrust::make_counting_iterator(0), count, + [d_strings, start, d_offsets, d_output_offsets, d_buffer] __device__(size_type idx) { + size_t output_offset = d_output_offsets[idx]; + size_t length = d_output_offsets[idx+1] - output_offset; // bytes + if( length ) // this is only 0 for nulls + { + idx += start; + size_type offset = d_offsets[idx]; + memcpy(d_buffer + output_offset, d_strings + offset, length-1 ); + } + }); + + // copy output buffer to host + std::vector h_offsets(count+1); + CUDA_TRY(cudaMemcpy( h_offsets.data(), d_output_offsets, (count+1)*sizeof(size_t), cudaMemcpyDeviceToHost)); + std::vector h_buffer(buffer_size); + CUDA_TRY(cudaMemcpy( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost )); + + // print out the strings to stdout + for( size_type idx=0; idx < count; ++idx ) + { + size_t offset = h_offsets[idx]; + size_t length = h_offsets[idx+1] - offset; + printf("%d:",idx); + if( length ) + printf("[%s]", h_buffer.data()+offset); + else + printf(""); + printf("%s",delimiter); + } +} + +std::pair, rmm::device_vector> + create_offsets( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + std::pair, rmm::device_vector> results; + + size_type count = strings.size(); + auto d_offsets = strings.offsets().data(); + results.second = rmm::device_vector(count+1); + CUDA_TRY(cudaMemcpyAsync( results.second.data().get(), d_offsets, (count+1)*sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); + + size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; + auto d_chars = strings.chars().data(); + results.first = rmm::device_vector(bytes); + CUDA_TRY(cudaMemcpyAsync( results.first.data().get(), d_chars, bytes, + cudaMemcpyDeviceToHost, stream)); + + return results; +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 30b284fc6e6..ca6128bd431 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -21,9 +21,11 @@ #include +#include #include #include + namespace cudf { namespace detail { @@ -39,19 +41,55 @@ table_device_view_base::table_device_view_base( : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()}, _stream{stream} { + + // The table's columns must be converted to ColumnDeviceView + // objects and copied into device memory for the table_device_view's + // _columns member. if (source_view.num_columns() > 0) { - size_type total_descendants = + // + // First calculate the size of memory needed to hold the + // array of ColumnDeviceViews. This is done by calling extent() + // for each of the ColumnViews in the table_view's columns. + size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, - [](size_type init, column_view col) { - return init + count_descendants(col); - }); - CUDF_EXPECTS(0 == total_descendants, - "Columns with descendants are not yet supported."); - - auto views_size_bytes = - source_view.num_columns() * sizeof(*source_view.begin()); + [](size_type init, auto col) { + return init + ColumnDeviceView::extent(col); + }); + // A buffer of CPU memory is allocated to hold the ColumnDeviceView + // objects. Once filled, the CPU memory is then copied to device memory + // at the _columns member pointer. + std::vector h_buffer(views_size_bytes); + ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); + // Each ColumnDeviceView instance may have child objects which may + // require setting some internal device pointers before being copied + // from CPU to device. + // Allocate the device memory to be used in the result. + // We need this pointer in order to pass it down when creating the + // ColumnDeviceViews so the column can set the pointer(s) for any + // of its child objects. RMM_TRY(RMM_ALLOC(&_columns, views_size_bytes, stream)); - CUDA_TRY(cudaMemcpyAsync(_columns, &(*source_view.begin()), + ColumnDeviceView* d_column = _columns; + // The beginning of the memory must be the fixed-sized ColumnDeviceView + // objects in order for _columns to be used as an array. Therefore, + // any child data is assigned to the end of this array. + int8_t* h_end = (int8_t*)(h_column + source_view.num_columns()); + int8_t* d_end = (int8_t*)(d_column + source_view.num_columns()); + // Create the ColumnDeviceView from each column within the CPU memory + // Any column child data should be copied into h_end and any + // internal pointers should be set using d_end. + for( auto itr=source_view.begin(); itr!=source_view.end(); ++itr ) + { + auto col = *itr; + // convert the ColumnView into ColumnDeviceView + new(h_column) ColumnDeviceView(col,(ptrdiff_t)h_end,(ptrdiff_t)d_end); + h_column++; // point to memory slot for the next ColumnDeviceView + // update the pointers for holding ColumnDeviceView's child data + auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView)); + h_end += col_child_data_size; + d_end += col_child_data_size; + } + + CUDA_TRY(cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream)); } } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 92960f4762f..ec83eab5625 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -499,6 +499,12 @@ set(DISPATCHER_TEST_SRC ConfigureTest(DISPATCHER_TEST "${DISPATCHER_TEST_SRC}") ################################################################################################### +# - strings test -------------------------------------------------------------------------------------- + +set(STRINGS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/strings/factories_test.cu") + +ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}") # - bitmask tests --------------------------------------------------------------------------------- set(BITMASK_TEST_SRC diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu new file mode 100644 index 00000000000..a2866dbd445 --- /dev/null +++ b/cpp/tests/strings/factories_test.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include + + +struct FactoriesTest : public cudf::test::BaseFixture {}; + +TEST_F(FactoriesTest, CreateColumnFromArray) +{ + std::vector h_test_strings{ "the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", nullptr, "absent stop words" }; + + cudf::size_type memsize = 0; + for( auto itr=h_test_strings.begin(); itr!=h_test_strings.end(); ++itr ) + memsize += *itr ? (cudf::size_type)strlen(*itr) : 0; + cudf::size_type count = (cudf::size_type)h_test_strings.size(); + thrust::host_vector h_buffer(memsize); + thrust::device_vector d_buffer(memsize); + thrust::host_vector > strings(count); + thrust::host_vector h_offsets(count+1); + cudf::size_type offset = 0; + cudf::size_type nulls = 0; + h_offsets[0] = 0; + for( cudf::size_type idx=0; idx < count; ++idx ) + { + const char* str = h_test_strings[idx]; + if( !str ) + { + strings[idx] = thrust::pair{nullptr,0}; + nulls++; + } + else + { + cudf::size_type length = (cudf::size_type)strlen(str); + memcpy( h_buffer.data() + offset, str, length ); + strings[idx] = thrust::pair{d_buffer.data().get()+offset,length}; + offset += length; + } + h_offsets[idx+1] = offset; + } + rmm::device_vector> d_strings(strings); + cudaMemcpy( d_buffer.data().get(), h_buffer.data(), memsize, cudaMemcpyHostToDevice ); + auto column = cudf::make_strings_column( d_strings ); + EXPECT_EQ(column->type(), cudf::data_type{cudf::STRING}); + EXPECT_EQ(column->null_count(), nulls); + if( nulls ) + { + EXPECT_TRUE(column->nullable()); + EXPECT_TRUE(column->has_nulls()); + } + EXPECT_EQ(2, column->num_children()); + + cudf::strings_column_view strings_view(column->view()); + EXPECT_EQ( strings_view.size(), count); + EXPECT_EQ( strings_view.offsets().size(), count+1 ); + EXPECT_EQ( strings_view.chars().size(), memsize ); + + // check string data + auto strings_data = cudf::strings::create_offsets(strings_view); + thrust::host_vector h_chars_data(strings_data.first); + thrust::host_vector h_offsets_data(strings_data.second); + EXPECT_EQ( memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0 ); + EXPECT_EQ( memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size()*sizeof(cudf::size_type)), 0); +} + +TEST_F(FactoriesTest, CreateColumnFromOffsets) +{ + std::vector h_test_strings{ "the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "absent stop words" }; + + cudf::size_type memsize = 0; + for( auto itr=h_test_strings.begin(); itr!=h_test_strings.end(); ++itr ) + memsize += *itr ? (cudf::size_type)strlen(*itr) : 0; + cudf::size_type count = (cudf::size_type)h_test_strings.size(); + thrust::host_vector h_buffer(memsize); + thrust::host_vector h_offsets(count+1); + cudf::size_type offset = 0; + h_offsets[0] = 0; + for( cudf::size_type idx=0; idx < count; ++idx ) + { + const char* str = h_test_strings[idx]; + if( str ) + { + cudf::size_type length = (cudf::size_type)strlen(str); + memcpy( h_buffer.data() + offset, str, length ); + offset += length; + } + h_offsets[idx+1] = offset; + } + rmm::device_vector d_buffer(h_buffer); + rmm::device_vector d_offsets(h_offsets); + rmm::device_vector d_nulls; + auto column = cudf::make_strings_column( d_buffer, d_offsets, d_nulls, 0 ); + EXPECT_EQ(column->type(), cudf::data_type{cudf::STRING}); + EXPECT_EQ(column->null_count(), 0); + EXPECT_EQ(2, column->num_children()); + + cudf::strings_column_view strings_view(column->view()); + EXPECT_EQ( strings_view.size(), count); + EXPECT_EQ( strings_view.offsets().size(), count+1 ); + EXPECT_EQ( strings_view.chars().size(), memsize ); + + // check string data + auto strings_data = cudf::strings::create_offsets(strings_view); + thrust::host_vector h_chars_data(strings_data.first); + thrust::host_vector h_offsets_data(strings_data.second); + EXPECT_EQ( memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0 ); + EXPECT_EQ( memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size()*sizeof(cudf::size_type)), 0); +}