Support Segmented Min/Max Reduction on String Type #10447

Merged Apr 29, 2022

Changes from all commits

Commits (25)
068542a
Passes compile, initial layout for how test should look like
isVoid Mar 16, 2022
66bdb75
remove unused include
isVoid Mar 16, 2022
9bdcc65
Fix exponent bug
isVoid Mar 16, 2022
3ddad32
Fix to test for max aggregation and add string type test
isVoid Mar 17, 2022
889c253
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into fea/seg_…
isVoid Mar 21, 2022
e78ee18
Remove excess brace
isVoid Mar 25, 2022
5dcd6a1
Merge branch 'branch-22.06' of github.com:rapidsai/cudf into fea/seg_…
isVoid Apr 1, 2022
eef7f21
Refactor `element_minmax_fn` to a shared utility file
isVoid Apr 4, 2022
a8dc8ea
Initial pass on string type min, max support
isVoid Apr 4, 2022
ed5eaf9
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
isVoid Apr 7, 2022
24c8f91
Merge branch 'branch-22.06' of github.com:rapidsai/cudf into fea/seg_…
isVoid Apr 11, 2022
fe5da2d
Add docstrings for `string_segmented_reduction` and tparams cleanups,…
isVoid Apr 11, 2022
ab801f1
Move element argminmax to a separate file.
isVoid Apr 11, 2022
5e7ca89
Add tests for string segmented reduction.
isVoid Apr 11, 2022
680e0fe
Remove stale commented out codes.
isVoid Apr 11, 2022
19e5df8
clang-format
bdice Apr 11, 2022
d826cf9
Redefine null placeholder.
isVoid Apr 11, 2022
2d3bd6b
Use constexpr
isVoid Apr 15, 2022
d4ad909
adopt ctad for vector types
isVoid Apr 15, 2022
10af494
Merge branch 'temp' into fea/seg_reduction_dec_type
isVoid Apr 15, 2022
4a3a541
Wrap argminmax idx condition with lambda
isVoid Apr 15, 2022
6910c16
Minor bug fix
isVoid Apr 15, 2022
1d0b716
Consolidate input data as fixtures
isVoid Apr 15, 2022
424f540
Update cpp/include/cudf/detail/utilities/element_argminmax.cuh
isVoid Apr 15, 2022
8f2a687
style fix
isVoid Apr 15, 2022
27 changes: 14 additions & 13 deletions cpp/include/cudf/detail/reduction.cuh
@@ -227,36 +227,36 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
* @brief Compute the specified simple reduction over each of the segments in the
* input range of elements.
*
- * @tparam Op the reduction operator with device binary operator
* @tparam InputIterator the input column iterator
* @tparam OffsetIterator the offset column iterator
+ * @tparam BinaryOp the device binary operator used to reduce
* @tparam OutputType the output type of reduction
*
* @param[in] d_in the begin iterator to input
* @param[in] d_offset the begin iterator to offset
* @param[in] num_segments the number of segments
- * @param[in] sop the reduction operator
+ * @param[in] binary_op the reduction operator
+ * @param[in] identity the identity element of the reduction operator
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
* @param[in] mr Device memory resource used to allocate the returned column's device
* memory
* @returns Output column in device memory
*
*/
- template <typename Op,
- typename InputIterator,
+ template <typename InputIterator,
typename OffsetIterator,
+ typename BinaryOp,
typename OutputType = typename thrust::iterator_value<InputIterator>::type,
typename std::enable_if_t<is_fixed_width<OutputType>() &&
- not cudf::is_fixed_point<OutputType>()>* = nullptr>
+ !cudf::is_fixed_point<OutputType>()>* = nullptr>
std::unique_ptr<column> segmented_reduce(InputIterator d_in,
OffsetIterator d_offset,
cudf::size_type num_segments,
- op::simple_op<Op> sop,
+ BinaryOp binary_op,
+ OutputType identity,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
- auto binary_op = sop.get_binary_op();
- auto identity = sop.template get_identity<OutputType>();
auto dev_result = make_fixed_width_column(
data_type{type_to_id<OutputType>()}, num_segments, mask_state::UNALLOCATED, stream, mr);
auto dev_result_mview = dev_result->mutable_view();
@@ -291,16 +291,17 @@ std::unique_ptr<column> segmented_reduce(InputIterator d_in,
return dev_result;
}

- template <typename Op,
- typename InputIterator,
+ template <typename InputIterator,
typename OffsetIterator,
+ typename BinaryOp,
typename OutputType = typename thrust::iterator_value<InputIterator>::type,
- typename std::enable_if_t<not is_fixed_width<OutputType>() ||
- is_fixed_point<OutputType>()>* = nullptr>
+ typename std::enable_if_t<!(is_fixed_width<OutputType>() &&
+ !cudf::is_fixed_point<OutputType>())>* = nullptr>
std::unique_ptr<column> segmented_reduce(InputIterator,
OffsetIterator,
cudf::size_type,
- op::simple_op<Op>,
+ BinaryOp,
+ OutputType,
rmm::cuda_stream_view,
rmm::mr::device_memory_resource*)
{
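For context, a minimal standalone sketch of the call pattern this refactor enables: the caller now passes an explicit binary operator and identity value instead of an op::simple_op wrapper. cudf's detail::segmented_reduce forwards to cub::DeviceSegmentedReduce internally; the data, operator, and identity below are illustrative, not from the PR.

#include <cub/device/device_segmented_reduce.cuh>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <limits>
#include <vector>

int main()
{
  thrust::device_vector<int> in(std::vector<int>{5, 2, 8, 3, 1, 9});  // segments [5,2,8] [3,1,9]
  thrust::device_vector<int> offsets(std::vector<int>{0, 3, 6});
  thrust::device_vector<int> out(2);

  auto binary_op = thrust::minimum<int>{};         // previously hidden inside sop.get_binary_op()
  int identity = std::numeric_limits<int>::max();  // previously sop.get_identity<OutputType>()

  // CUB's two-phase call: first size the scratch buffer, then reduce.
  size_t temp_bytes = 0;
  cub::DeviceSegmentedReduce::Reduce(nullptr, temp_bytes,
                                     in.data().get(), out.data().get(), 2,
                                     offsets.data().get(), offsets.data().get() + 1,
                                     binary_op, identity);
  thrust::device_vector<unsigned char> temp(temp_bytes);
  cub::DeviceSegmentedReduce::Reduce(temp.data().get(), temp_bytes,
                                     in.data().get(), out.data().get(), 2,
                                     offsets.data().get(), offsets.data().get() + 1,
                                     binary_op, identity);
  // out now holds {2, 1}
}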
61 changes: 61 additions & 0 deletions cpp/include/cudf/detail/utilities/element_argminmax.cuh
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>

#include <type_traits>

namespace cudf {
namespace detail {

/**
* @brief Binary `argmin`/`argmax` operator
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
template <typename T>
struct element_argminmax_fn {
column_device_view const d_col;
bool const has_nulls;
bool const arg_min;

__device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
{
// The extra bounds checking is due to issue github.com/rapidsai/cudf/issues/9156 and
// github.com/NVIDIA/thrust/issues/1525
// where invalid random values may be passed here by thrust::reduce_by_key
auto out_of_bound_or_null = [this] __device__(size_type const& idx) {
return idx < 0 || idx >= this->d_col.size() ||
(this->has_nulls && this->d_col.is_null_nocheck(idx));
};
if (out_of_bound_or_null(lhs_idx)) { return rhs_idx; }
if (out_of_bound_or_null(rhs_idx)) { return lhs_idx; }

// Return `lhs_idx` iff:
// row(lhs_idx) < row(rhs_idx) and finding ArgMin, or
// row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
auto const less = d_col.element<T>(lhs_idx) < d_col.element<T>(rhs_idx);
return less == arg_min ? lhs_idx : rhs_idx;
}
};

} // namespace detail
} // namespace cudf
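To see how an index-valued binary operator like this is meant to be driven, here is a self-contained sketch of a per-group argmin with thrust::reduce_by_key. It is simplified: a raw pointer stands in for the column_device_view, int data stands in for strings, and the hand-rolled argmin_fn stands in for element_argminmax_fn.

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>
#include <vector>

struct argmin_fn {
  int const* data;
  int size;
  __host__ __device__ int operator()(int lhs, int rhs) const
  {
    // Guard against out-of-bound indices, mirroring out_of_bound_or_null above:
    // thrust::reduce_by_key may feed the operator uninitialized intermediates.
    if (lhs < 0 || lhs >= size) { return rhs; }
    if (rhs < 0 || rhs >= size) { return lhs; }
    return data[lhs] < data[rhs] ? lhs : rhs;
  }
};

int main()
{
  thrust::device_vector<int> keys(std::vector<int>{0, 0, 0, 1, 1});  // presorted group labels
  thrust::device_vector<int> vals(std::vector<int>{7, 3, 9, 4, 1});
  thrust::device_vector<int> out_keys(2), out_idx(2);

  argmin_fn op{vals.data().get(), static_cast<int>(vals.size())};
  thrust::reduce_by_key(keys.begin(), keys.end(),
                        thrust::make_counting_iterator(0),  // reduce row indices, not values
                        out_keys.begin(), out_idx.begin(),
                        thrust::equal_to<int>{}, op);
  // out_idx == {1, 4}: the rows holding each group's minimum
}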
36 changes: 3 additions & 33 deletions cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -23,6 +23,7 @@
#include <cudf/column/column_view.hpp>
#include <cudf/detail/aggregation/aggregation.cuh>
#include <cudf/detail/iterator.cuh>
+ #include <cudf/detail/utilities/element_argminmax.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/table/row_operators.cuh>
#include <cudf/types.hpp>
@@ -40,37 +41,6 @@ namespace cudf {
namespace groupby {
namespace detail {

- /**
- * @brief Binary operator with index values into the input column.
- *
- * @tparam T Type of the underlying column. Must support '<' operator.
- */
- template <typename T>
- struct element_arg_minmax_fn {
- column_device_view const d_col;
- bool const has_nulls;
- bool const arg_min;
-
- __device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
- {
- // The extra bounds checking is due to issue github.com/rapidsai/cudf/issues/9156 and
- // github.com/NVIDIA/thrust/issues/1525
- // where invalid random values may be passed here by thrust::reduce_by_key
- if (lhs_idx < 0 || lhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(lhs_idx))) {
- return rhs_idx;
- }
- if (rhs_idx < 0 || rhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(rhs_idx))) {
- return lhs_idx;
- }
-
- // Return `lhs_idx` iff:
- // row(lhs_idx) < row(rhs_idx) and finding ArgMin, or
- // row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
- auto const less = d_col.element<T>(lhs_idx) < d_col.element<T>(rhs_idx);
- return less == arg_min ? lhs_idx : rhs_idx;
- }
- };

/**
* @brief Value accessor for column which supports dictionary column too.
*
@@ -211,8 +181,8 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_support

if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
- auto const binop =
- element_arg_minmax_fn<T>{*d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN};
+ auto const binop = cudf::detail::element_argminmax_fn<T>{
+ *d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN};
do_reduction(count_iter, result_begin, binop);
} else {
using OpType = cudf::detail::corresponding_operator_t<K>;
136 changes: 132 additions & 4 deletions cpp/src/reductions/simple_segmented.cuh
@@ -16,12 +16,15 @@

#pragma once

#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/null_mask.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/reduction.cuh>
#include <cudf/detail/unary.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/element_argminmax.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
@@ -31,9 +34,12 @@

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>

#include <type_traits>

namespace cudf {
namespace reduction {
namespace simple {
@@ -70,18 +76,21 @@ std::unique_ptr<column> simple_segmented_reduction(column_view const& col,
auto simple_op = Op{};
size_type num_segments = offsets.size() - 1;

+ auto binary_op = simple_op.get_binary_op();
+ auto identity = simple_op.template get_identity<ResultType>();

// TODO: Explore rewriting null_replacing_element_transformer/element_transformer with nullate
auto result = [&] {
if (col.has_nulls()) {
auto f = simple_op.template get_null_replacing_element_transformer<ResultType>();
auto it = thrust::make_transform_iterator(dcol->pair_begin<InputType, true>(), f);
return cudf::reduction::detail::segmented_reduce(
- it, offsets.begin(), num_segments, simple_op, stream, mr);
+ it, offsets.begin(), num_segments, binary_op, identity, stream, mr);
} else {
auto f = simple_op.template get_element_transformer<ResultType>();
auto it = thrust::make_transform_iterator(dcol->begin<InputType>(), f);
return cudf::reduction::detail::segmented_reduce(
- it, offsets.begin(), num_segments, simple_op, stream, mr);
+ it, offsets.begin(), num_segments, binary_op, identity, stream, mr);
}
}();

@@ -103,6 +112,112 @@
return result;
}
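The null-handling branch above relies on a transformer that substitutes the operator's identity for null elements, so nulls become no-ops in the reduction. A standalone sketch of that pattern with plain Thrust (the functor, names, and data are illustrative, not cudf's get_null_replacing_element_transformer itself):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/tuple.h>
#include <limits>
#include <vector>

struct null_replacing_value {
  int identity;
  __host__ __device__ int operator()(thrust::tuple<int, bool> const& p) const
  {
    // Valid elements pass through; nulls become the identity, a no-op for min.
    return thrust::get<1>(p) ? thrust::get<0>(p) : identity;
  }
};

int main()
{
  thrust::device_vector<int> vals(std::vector<int>{5, 2, 8});
  thrust::device_vector<bool> valid(3, true);
  valid[1] = false;  // mark row 1 as null

  int const identity = std::numeric_limits<int>::max();
  auto zipped = thrust::make_zip_iterator(thrust::make_tuple(vals.begin(), valid.begin()));
  auto it = thrust::make_transform_iterator(zipped, null_replacing_value{identity});

  int result = thrust::reduce(it, it + vals.size(), identity, thrust::minimum<int>{});
  // result == 5: the null value 2 at row 1 was replaced by the identity
}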

/**
* @brief String segmented reduction for 'min', 'max'.
*
* This algorithm uses argmin/argmax as a custom comparator to build a gather
* map, then builds the output.
*
* @tparam InputType the input column data-type
* @tparam Op the reduction operator, either cudf::reduction::op::min or cudf::reduction::op::max
* @param col Input column of data to reduce.
* @param offsets Indices to segment boundaries.
* @param null_handling If `null_policy::INCLUDE`, all elements in a segment
* must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`,
* the reduced value is valid if any element in the segment is valid.
* @param stream Used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Output column in device memory
*/

template <typename InputType,
typename Op,
CUDF_ENABLE_IF(std::is_same_v<Op, cudf::reduction::op::min> ||
std::is_same_v<Op, cudf::reduction::op::max>)>
std::unique_ptr<column> string_segmented_reduction(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Pass to simple_segmented_reduction, get indices to gather, perform gather here.
auto device_col = cudf::column_device_view::create(col, stream);

auto it = thrust::make_counting_iterator(0);
auto const num_segments = static_cast<size_type>(offsets.size()) - 1;

bool constexpr is_argmin = std::is_same_v<Op, cudf::reduction::op::min>;
auto string_comparator =
cudf::detail::element_argminmax_fn<InputType>{*device_col, col.has_nulls(), is_argmin};
auto constexpr identity =
is_argmin ? cudf::detail::ARGMIN_SENTINEL : cudf::detail::ARGMAX_SENTINEL;

auto gather_map =
cudf::reduction::detail::segmented_reduce(it,
offsets.begin(),
num_segments,
string_comparator,
identity,
stream,
rmm::mr::get_current_device_resource());
auto result = std::move(cudf::detail::gather(table_view{{col}},
*gather_map,
cudf::out_of_bounds_policy::NULLIFY,
cudf::detail::negative_index_policy::NOT_ALLOWED,
stream,
mr)
->release()[0]);
auto const [segmented_null_mask, segmented_null_count] =
cudf::detail::segmented_null_mask_reduction(col.null_mask(),
offsets.begin(),
offsets.end() - 1,
offsets.begin() + 1,
null_handling,
stream,
mr);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A super minor nit: we can determine if segmented_null_mask is a temporary memory based on result->null_count() and choose to use default resource v.s. supplied resource here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to ignore or punt to next one @bdice.

Copy link
Contributor

@bdice bdice Apr 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are already using mr as efficiently (minimally) as possible. If result->null_count() == 0, we return the mr-allocated bitmask. Else, we do an inplace_bitmask_and that alters the mr-allocated bitmask before returning it.

The use of mr in inplace_bitmask_and, which calls inplace_bitmask_binop, is a little misleading. The allocations there are always temporary and are not part of the return value. I think it may be possible to refactor the inplace functions to remove mr as an argument, and require the use of the default memory resource to avoid an implication that it is used for the returned data (it is not, since it acts in-place on bitmasks passed in that were from some other allocator).

Copy link
Contributor Author

@isVoid isVoid Apr 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the gather result contains null, its null_mask is allocated with mr. The inplace_bitmask_and uses this as the target null mask, in this situation, the segmented_null_mask doesn't need to use the supplied mr because this null mask is acting only as an operand in the inplace_nullmask_and, not as the result.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that might be the case. The segmented_null_mask_reduction could use result->null_count() == 0 ? mr : rmm::mr::get_current_device_resource(). I'm not sure if I would recommend using a conditional mr for the sake of clarity, but it seems that it could be done.
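A minimal sketch of the conditional resource choice discussed above — illustrative only, not part of the merged PR; result and mr are the surrounding function's variables:

// Hypothetical variant, not merged: pick the segmented mask's allocator based
// on whether the mask can end up owned by the result.
auto* mask_mr = result->null_count() == 0
                  ? mr                                       // mask may be installed on the result
                  : rmm::mr::get_current_device_resource();  // mask is only a temporary operand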


// If the segmented null mask contains any null values, the segmented null mask
// must be combined with the result null mask.
if (segmented_null_count > 0) {
if (result->null_count() == 0) {
// The result has no nulls. Use the segmented null mask.
result->set_null_mask(segmented_null_mask, segmented_null_count, stream);
} else {
// Compute the logical AND of the segmented output null mask and the
// result null mask to update the result null mask and null count.
auto result_mview = result->mutable_view();
std::vector masks{static_cast<bitmask_type const*>(result_mview.null_mask()),
static_cast<bitmask_type const*>(segmented_null_mask.data())};
std::vector<size_type> begin_bits{0, 0};
auto const valid_count = cudf::detail::inplace_bitmask_and(
device_span<bitmask_type>(static_cast<bitmask_type*>(result_mview.null_mask()),
num_bitmask_words(result->size())),
masks,
begin_bits,
result->size(),
stream,
mr);
result->set_null_count(result->size() - valid_count);
}
}

return result;
}
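The gather step at the heart of this function can be sketched in isolation. Here thrust::gather stands in for cudf::detail::gather, and sentinel handling is noted in a comment rather than performed; the data is illustrative.

#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <vector>

int main()
{
  thrust::device_vector<int> values(std::vector<int>{7, 3, 9, 4, 1});
  // One argmin index per segment, as produced by the segmented reduction.
  thrust::device_vector<int> gather_map(std::vector<int>{1, 4});
  thrust::device_vector<int> out(gather_map.size());

  thrust::gather(gather_map.begin(), gather_map.end(), values.begin(), out.begin());
  // out == {3, 1}. In cudf, a sentinel index left by an empty or all-null
  // segment is turned into a null row by out_of_bounds_policy::NULLIFY.
}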

template <typename InputType,
typename Op,
CUDF_ENABLE_IF(!std::is_same_v<Op, cudf::reduction::op::min> &&
!std::is_same_v<Op, cudf::reduction::op::max>)>
std::unique_ptr<column> string_segmented_reduction(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FAIL("Segmented reduction on string column only supports min and max reduction.");
}
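The two overloads above select at compile time on Op: exactly one is viable for any operator, so unsupported reductions fail at run time with a clear message rather than failing to instantiate. A minimal sketch of the same enable_if-based selection, with hypothetical tag types standing in for cudf::reduction::op::min/max:

#include <stdexcept>
#include <type_traits>

struct min_op {};
struct max_op {};
struct sum_op {};

template <typename Op>
inline constexpr bool is_minmax_v = std::is_same_v<Op, min_op> || std::is_same_v<Op, max_op>;

template <typename Op, std::enable_if_t<is_minmax_v<Op>>* = nullptr>
void reduce_strings() { /* the supported path */ }

template <typename Op, std::enable_if_t<!is_minmax_v<Op>>* = nullptr>
void reduce_strings() { throw std::logic_error{"only min and max are supported"}; }

int main()
{
  reduce_strings<min_op>();    // selects the supported overload
  try {
    reduce_strings<sum_op>();  // selects the fallback, which throws
  } catch (std::logic_error const&) {}
}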

/**
* @brief Call reduce and return a column of type bool.
*
@@ -153,7 +268,9 @@ struct same_column_type_dispatcher {
}

public:
- template <typename ElementType, std::enable_if_t<is_supported<ElementType>()>* = nullptr>
+ template <typename ElementType,
+ CUDF_ENABLE_IF(is_supported<ElementType>() &&
+ !std::is_same_v<ElementType, string_view>)>
std::unique_ptr<column> operator()(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
@@ -164,7 +281,18 @@
col, offsets, null_handling, stream, mr);
}

- template <typename ElementType, std::enable_if_t<not is_supported<ElementType>()>* = nullptr>
+ template <typename ElementType,
+ CUDF_ENABLE_IF(is_supported<ElementType>() && std::is_same_v<ElementType, string_view>)>
+ std::unique_ptr<column> operator()(column_view const& col,
+ device_span<size_type const> offsets,
+ null_policy null_handling,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
+ {
+ return string_segmented_reduction<ElementType, Op>(col, offsets, null_handling, stream, mr);
+ }
+
+ template <typename ElementType, CUDF_ENABLE_IF(!is_supported<ElementType>())>
std::unique_ptr<column> operator()(column_view const&,
device_span<size_type const>,
null_policy,