Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix call to thrust::reduce_by_key in argmin/argmax libcudf groupby #9263

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 30 additions & 66 deletions cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,77 +31,50 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>

namespace cudf {
namespace groupby {
namespace detail {

// ArgMin binary operator with tuple of (value, index)
/**
* @brief ArgMin binary operator with index values into input column.
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
template <typename T>
struct ArgMin {
CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs,
thrust::tuple<T, size_type> const& rhs) const
{
if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL)
return rhs;
else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL)
return lhs;
else
return thrust::get<0>(lhs) < thrust::get<0>(rhs) ? lhs : rhs;
}
};

// ArgMax binary operator with tuple of (value, index)
template <typename T>
struct ArgMax {
CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs,
thrust::tuple<T, size_type> const& rhs) const
{
if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL)
return rhs;
else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL)
return lhs;
else
return thrust::get<0>(lhs) > thrust::get<0>(rhs) ? lhs : rhs;
}
};

struct get_tuple_second_element {
template <typename T>
__device__ size_type operator()(thrust::tuple<T, size_type> const& rhs) const
column_device_view const d_col;
CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
{
return thrust::get<1>(rhs);
// The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
// github.com/NVIDIA/thrust/issues/1525
// where invalid random values may be passed here by thrust::reduce_by_key
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// where invalid random values

Is there ever valid random values?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could happen. The data could be randomly valid. Either way, the thrust::reduce_by_key ignores the result -- it is just trying to fill a block/warp with minimal divergence.

if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if rhs here is also out of bound or null? Is returning out of bound or null desired?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the bounds checking is necessary seeing as this gets its values from a counting iterator which is always less than the size of d_col.

For null, I do think it's ok to return the index of a null element. If both are null then either can be returned, the winning idx will later be removed when compared against an idx corresponding to valid value. And if the entire group contains nulls then it'll be nullified in the group mask generation step.

Copy link
Contributor Author

@davidwendt davidwendt Sep 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bounds checking is necessary because the two issues mentioned in the comment above.The thrust::reduce_by_key may actually pass an invalid (out-of-bounds) value because in certain cases it does not call through the iterator to retrieve the value it should pass. In these cases, it ignores the output but the damage is done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see. So bound check here is only to ensure the line 55 below works correctly, not to ensure the output to be used correctly later (since it is already handled).

if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
return d_col.element<T>(lhs) < d_col.element<T>(rhs) ? lhs : rhs;
}
};

/**
* @brief Functor to store the boolean value to null mask.
* @brief ArgMax binary operator with index values into input column.
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
struct bool_to_nullmask {
mutable_column_device_view d_result;
__device__ void operator()(size_type i, bool rhs)
template <typename T>
struct ArgMax {
column_device_view const d_col;
CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
{
if (rhs) {
d_result.set_valid(i);
} else {
d_result.set_null(i);
}
// The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
// github.com/NVIDIA/thrust/issues/1525
// where invalid random values may be passed here by thrust::reduce_by_key
if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
return d_col.element<T>(rhs) < d_col.element<T>(lhs) ? lhs : rhs;
}
};

/**
* @brief Returns index for non-null element, and SENTINEL for null element in a column.
*
*/
struct null_as_sentinel {
column_device_view const col;
size_type const SENTINEL;
__device__ size_type operator()(size_type i) const { return col.is_null(i) ? SENTINEL : i; }
};

/**
* @brief Value accessor for column which supports dictionary column too.
*
Expand Down Expand Up @@ -191,25 +164,16 @@ struct reduce_functor {
auto resultview = mutable_column_device_view::create(result->mutable_view(), stream);
auto valuesview = column_device_view::create(values, stream);
if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
constexpr auto SENTINEL =
(K == aggregation::ARGMAX ? cudf::detail::ARGMAX_SENTINEL : cudf::detail::ARGMIN_SENTINEL);
auto idx_begin =
cudf::detail::make_counting_transform_iterator(0, null_as_sentinel{*valuesview, SENTINEL});
// dictionary keys are sorted, so dictionary32 index comparison is enough.
auto column_begin = valuesview->begin<DeviceType>();
auto begin = thrust::make_zip_iterator(thrust::make_tuple(column_begin, idx_begin));
auto result_begin = thrust::make_transform_output_iterator(resultview->begin<ResultDType>(),
get_tuple_second_element{});
using OpType =
std::conditional_t<(K == aggregation::ARGMAX), ArgMax<DeviceType>, ArgMin<DeviceType>>;
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.data(),
group_labels.data() + group_labels.size(),
begin,
thrust::make_counting_iterator<ResultType>(0),
thrust::make_discard_iterator(),
result_begin,
thrust::equal_to<size_type>{},
OpType{});
resultview->begin<ResultType>(),
thrust::equal_to<ResultType>{},
OpType{*valuesview});
} else {
auto init = OpType::template identity<DeviceType>();
auto begin = cudf::detail::make_counting_transform_iterator(
Expand Down