Skip to content

Commit

Permalink
Merge pull request #2811 from davidwendt/fea-ext-column-redesign
Browse files Browse the repository at this point in the history
[REVIEW] cudf strings column
  • Loading branch information
harrism authored Oct 7, 2019
2 parents 5082fcd + cda8b99 commit 6693374
Show file tree
Hide file tree
Showing 19 changed files with 1,792 additions and 97 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
- PR #2838 CSV Reader: Support ARROW_RANDOM_FILE input
- PR #2655 CuPy-based Series and Dataframe .values property
- PR #2803 Added `edit_distance_matrix()` function to calculate pairwise edit distance for each string on a given nvstrings object.
- PR #2811 Start of cudf strings column work based on 2207
- PR #2872 Add Java pinned memory pool allocator
- PR #2814 Add Datetimeindex.weekday

Expand Down
14 changes: 8 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ add_library(cudf
src/column/legacy/column.cpp
src/column/legacy/context.cpp
src/table/legacy/table.cpp
src/string/nvcategory_util.cpp
src/strings/nvcategory_util.cpp
src/join/joining.cu
src/orderby/orderby.cu
src/predicates/is_sorted.cu
Expand Down Expand Up @@ -432,13 +432,10 @@ add_library(cudf
src/table/table.cpp
src/bitmask/null_mask.cu
src/sort/sort.cu
src/strings/strings_column_factories.cu
src/strings/strings_column_view.cu
src/column/legacy/interop.cpp)

# Override RPATH for nvstrings
set_target_properties(libNVStrings PROPERTIES BUILD_RPATH "\$ORIGIN")
set_target_properties(libNVCategory PROPERTIES BUILD_RPATH "\$ORIGIN")
set_target_properties(libNVText PROPERTIES BUILD_RPATH "\$ORIGIN")

# Rename installation to proper names for later finding
set_target_properties(libNVStrings PROPERTIES OUTPUT_NAME "NVStrings")
set_target_properties(libNVCategory PROPERTIES OUTPUT_NAME "NVCategory")
Expand All @@ -447,6 +444,11 @@ set_target_properties(libNVText PROPERTIES OUTPUT_NAME "NVText")
# Override RPATH for cudf
set_target_properties(cudf PROPERTIES BUILD_RPATH "\$ORIGIN")

# Override RPATH for nvstrings
set_target_properties(libNVStrings PROPERTIES BUILD_RPATH "\$ORIGIN")
set_target_properties(libNVCategory PROPERTIES BUILD_RPATH "\$ORIGIN")
set_target_properties(libNVText PROPERTIES BUILD_RPATH "\$ORIGIN")

###################################################################################################
# - jitify ----------------------------------------------------------------------------------------

Expand Down
124 changes: 80 additions & 44 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>

namespace cudf {

Expand Down Expand Up @@ -70,34 +72,6 @@ class alignas(16) column_device_view_base {
return head<T>() + _offset;
}

/**---------------------------------------------------------------------------*
 * @brief Gives read-only access to one element of the column.
 *
 * The element is looked up relative to the pointer returned by `data<T>()`,
 * so whatever offset `data<T>()` applies is honored here as well.
 *
 * @tparam T The element type
 * @param element_index Position of the desired element
 * @return Const reference to the element stored at `element_index`
 *---------------------------------------------------------------------------**/
template <typename T>
__device__ T const& element(size_type element_index) const noexcept {
  T const* const elements = data<T>();
  return elements[element_index];
}

/**---------------------------------------------------------------------------*
* @brief Returns `string_view` to the string element at the specified index.
*
* This function accounts for the offset.
*
* @param element_index Position of the desired string
*---------------------------------------------------------------------------**/
/*
template <>
__device__ string_view const& element<string_view>(
size_type element_index) const noexcept {
// Fill this in
}
*/

/**---------------------------------------------------------------------------*
* @brief Returns the number of elements in the column
*---------------------------------------------------------------------------**/
Expand Down Expand Up @@ -234,6 +208,33 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
column_device_view& operator=(column_device_view const&) = default;
column_device_view& operator=(column_device_view&&) = default;

/**---------------------------------------------------------------------------*
* @brief Creates an instance of this class using the specified host memory
* pointer (h_ptr) to store child objects and the device memory pointer (d_ptr)
* as a base for any child object pointers.
*
* @param column Column view from which to create this instance.
* @param h_ptr Host memory pointer on which to place any child data.
* @param d_ptr Device memory pointer on which to base any child pointers.
*---------------------------------------------------------------------------**/
column_device_view( column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr );

/**---------------------------------------------------------------------------*
 * @brief Fetches a copy of the element stored at the given position.
 *
 * No null check is made here. When `is_null(element_index) == true` the
 * returned value must not be used; doing so is undefined behavior.
 *
 * The element is looked up relative to the pointer returned by `data<T>()`,
 * so whatever offset `data<T>()` applies is honored here as well.
 *
 * @tparam T The element type
 * @param element_index Position of the desired element
 * @return The element stored at `element_index`, returned by value
 *---------------------------------------------------------------------------**/
template <typename T>
__device__ T const element(size_type element_index) const noexcept {
  T const* const elements = data<T>();
  return elements[element_index];
}

/**---------------------------------------------------------------------------*
* @brief Factory to construct a column view that is usable in device memory.
*
Expand Down Expand Up @@ -265,6 +266,14 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*---------------------------------------------------------------------------**/
void destroy();

/**---------------------------------------------------------------------------*
* @brief Return the size in bytes of the amount of memory needed to hold a
* device view of the specified column and its children.
*
* @param source_view The `column_view` to use for this calculation.
*---------------------------------------------------------------------------**/
static size_type extent(column_view source_view);

/**---------------------------------------------------------------------------*
* @brief Returns the specified child
*
Expand Down Expand Up @@ -308,6 +317,17 @@ class alignas(16) mutable_column_device_view
default;
mutable_column_device_view& operator=(mutable_column_device_view&&) = default;

/**---------------------------------------------------------------------------*
* @brief Creates an instance of this class using the specified host memory
* pointer (h_ptr) to store child objects and the device memory pointer (d_ptr)
* as a base for any child object pointers.
*
* @param column Column view from which to create this instance.
* @param h_ptr Host memory pointer on which to place any child data.
* @param d_ptr Device memory pointer on which to base any child pointers.
*---------------------------------------------------------------------------**/
mutable_column_device_view( mutable_column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr );

/**---------------------------------------------------------------------------*
* @brief Factory to construct a column view that is usable in device memory.
*
Expand All @@ -327,7 +347,8 @@ class alignas(16) mutable_column_device_view
* @return A `unique_ptr` to a `mutable_column_device_view` that makes the
*data from `source_view` available in device memory.
*---------------------------------------------------------------------------**/
static auto create(mutable_column_view source_view, cudaStream_t stream = 0);
static std::unique_ptr<mutable_column_device_view, std::function<void(mutable_column_device_view*)>>
create(mutable_column_view source_view, cudaStream_t stream = 0);

/**---------------------------------------------------------------------------*
* @brief Returns pointer to the base device memory allocation casted to
Expand Down Expand Up @@ -376,21 +397,6 @@ class alignas(16) mutable_column_device_view
return data<T>()[element_index];
}

/**---------------------------------------------------------------------------*
* @brief Returns `string_view` to the string element at the specified index.
*
* This function accounts for the offset.
*
* @param element_index Position of the desired string
*---------------------------------------------------------------------------**/
/*
template <>
__device__ string_view& element<string_view>(
size_type element_index) noexcept {
// Fill this in
}
*/

/**---------------------------------------------------------------------------*
* @brief Returns raw pointer to the underlying bitmask allocation.
*
Expand Down Expand Up @@ -454,6 +460,14 @@ class alignas(16) mutable_column_device_view
null_mask()[word_index] = new_word;
}

/**---------------------------------------------------------------------------*
* @brief Return the size in bytes of the amount of memory needed to hold a
* device view of the specified column and its children.
*
* @param source_view The `column_view` to use for this calculation.
*---------------------------------------------------------------------------**/
static size_type extent(mutable_column_view source_view);

private:
mutable_column_device_view*
mutable_children{}; ///< Array of `mutable_column_device_view`
Expand All @@ -479,6 +493,28 @@ class alignas(16) mutable_column_device_view
* allocated to hold the child views.
*---------------------------------------------------------------------------**/
void destroy();

};

/**---------------------------------------------------------------------------*
 * @brief Specialization of `element` that builds a `string_view` for the
 * string stored at the given position.
 *
 * The string's bytes are located through two child columns: the offsets child
 * (`strings_column_view::offsets_column_index`) supplies the begin and end
 * byte positions, and the chars child
 * (`strings_column_view::chars_column_index`) supplies the character data.
 *
 * No null check is made here. When `is_null(element_index) == true` the
 * returned value must not be used; doing so is undefined behavior.
 *
 * This function accounts for the offset.
 *
 * @param element_index Position of the desired string element
 * @return string_view instance representing the element at this index
 *---------------------------------------------------------------------------**/
template <>
__device__ inline string_view const column_device_view::element<string_view>(
    size_type element_index) const noexcept {
  // Shift the requested position by this view's offset before touching the offsets child.
  size_type const row = element_index + offset();
  // bounds[0] / bounds[1] are the consecutive offsets that delimit this row's bytes.
  int32_t const* const bounds =
      d_children[strings_column_view::offsets_column_index].data<int32_t>() + row;
  char const* const chars =
      d_children[strings_column_view::chars_column_index].data<char>();
  size_type const first_byte = bounds[0];
  return string_view{chars + first_byte, bounds[1] - first_byte};
}

} // namespace cudf
66 changes: 66 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <cudf/types.hpp>
#include "column.hpp"

#include <rmm/thrust_rmm_allocator.h>

namespace cudf {
/**---------------------------------------------------------------------------*
* @brief Construct column with sufficient uninitialized storage
Expand All @@ -43,4 +45,68 @@ std::unique_ptr<column> make_numeric_column(
data_type type, size_type size, mask_state state = UNALLOCATED,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**---------------------------------------------------------------------------*
* @brief Construct STRING type column given an array of pointer/size pairs.
* The total number of char bytes must not exceed the maximum size of size_type.
* The string characters are expected to be UTF-8 encoded sequence of char bytes.
* Use the strings_column_view class to perform strings operations on this type
* of column.
*
* @note `null_count()` and `null_bitmask` are determined if a pair contains
* a null string. That is, for each pair, if `.first` is null, that string
* is considered null. Likewise, a string is considered empty (not null)
* if `.first` is not null and `.second` is 0. Otherwise the `.first` member
* must be a valid device address pointing to `.second` consecutive bytes.
*
* @throws std::bad_alloc if device memory allocation fails
*
* @param strings The pointer/size pair arrays.
* Each pointer must be a device memory address or `nullptr` (indicating a null string).
* The size must be the number of bytes.
* @param stream Optional stream for use with all memory allocation
* and device kernels
* @param mr Optional resource to use for device memory
* allocation of the column's `null_mask` and children.
*---------------------------------------------------------------------------**/
std::unique_ptr<column> make_strings_column(
const rmm::device_vector<thrust::pair<const char*,size_type>>& strings,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**---------------------------------------------------------------------------*
* @brief Construct STRING type column given a contiguous array of chars
* encoded as UTF-8, an array of byte offsets identifying individual strings
* within the char array, and a null bitmask.
* The total number of char bytes must not exceed the maximum size of size_type.
* Use the strings_column_view class to perform strings operations on this type
* of column.
* This function makes a deep copy of the strings, offsets, null_mask to create
* a new column.
*
* @throws std::bad_alloc if device memory allocation fails
*
* @param strings The contiguous array of chars in device memory.
* This char array is expected to be UTF-8 encoded characters.
* @param offsets The array of byte offsets in device memory.
* The number of elements is one more than the total number
* of strings so the offset[last] - offset[0] is the total
* number of bytes in the strings array.
* @param null_mask The array of bits specifying the null strings.
* This array must be in device memory.
* Arrow format for nulls is used for interpreting this bitmask.
* @param null_count The number of null string entries.
* @param stream Optional stream for use with all memory allocation
* and device kernels
* @param mr Optional resource to use for device memory
* allocation of the column's `null_mask` and children.
*---------------------------------------------------------------------------**/
std::unique_ptr<column> make_strings_column(
const rmm::device_vector<char>& strings,
const rmm::device_vector<size_type>& offsets,
const rmm::device_vector<bitmask_type>& null_mask,
size_type null_count,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

} // namespace cudf
Loading

0 comments on commit 6693374

Please sign in to comment.