Support Segmented Min/Max Reduction on String Type (#10447)
This PR adds `min`/`max` segmented reduction support for the string type.

Part of #10417
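
For orientation, here is a rough usage sketch (not part of this commit) of how the new string path might be exercised through the public `cudf::segmented_reduce` API. The exact signature and argument order, and the test column wrapper, are assumptions based on the cudf release of this era.

#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/reduction.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <thrust/device_vector.h>

#include <memory>
#include <vector>

// Reduce each segment of a STRING column to its maximum element.
std::unique_ptr<cudf::column> max_per_segment_example()
{
  // Three segments over six strings: [0,2), [2,5), [5,6)
  auto const input = cudf::test::strings_column_wrapper{
    "apple", "banana", "cherry", "date", "elderberry", "fig"};
  auto const host_offsets = std::vector<cudf::size_type>{0, 2, 5, 6};
  auto const offsets      = thrust::device_vector<cudf::size_type>(host_offsets);

  // MIN/MAX segmented reduction on strings is what this PR enables.
  return cudf::segmented_reduce(
    input,
    cudf::device_span<cudf::size_type const>{offsets.data().get(), offsets.size()},
    *cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>(),
    cudf::data_type{cudf::type_id::STRING},
    cudf::null_policy::EXCLUDE);
}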

Authors:
  - Michael Wang (https://github.com/isVoid)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: #10447
isVoid authored Apr 29, 2022
1 parent 9b8d26f commit 9112907
Showing 5 changed files with 331 additions and 50 deletions.
27 changes: 14 additions & 13 deletions cpp/include/cudf/detail/reduction.cuh
@@ -227,36 +227,36 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
* @brief Compute the specified simple reduction over each of the segments in the
* input range of elements.
*
* @tparam Op the reduction operator with device binary operator
* @tparam InputIterator the input column iterator
* @tparam OffsetIterator the offset column iterator
* @tparam BinaryOp the device binary operator used to reduce
* @tparam OutputType the output type of reduction
*
* @param[in] d_in the begin iterator to input
* @param[in] d_offset the begin iterator to offset
* @param[in] num_segments the number of segments
* @param[in] sop the reduction operator
* @param[in] binary_op the reduction operator
* @param[in] identity the identity element of the reduction operator
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
* @param[in] mr Device memory resource used to allocate the returned column's device
* memory
* @returns Output column in device memory
*
*/
template <typename Op,
typename InputIterator,
template <typename InputIterator,
typename OffsetIterator,
typename BinaryOp,
typename OutputType = typename thrust::iterator_value<InputIterator>::type,
typename std::enable_if_t<is_fixed_width<OutputType>() &&
not cudf::is_fixed_point<OutputType>()>* = nullptr>
!cudf::is_fixed_point<OutputType>()>* = nullptr>
std::unique_ptr<column> segmented_reduce(InputIterator d_in,
OffsetIterator d_offset,
cudf::size_type num_segments,
op::simple_op<Op> sop,
BinaryOp binary_op,
OutputType identity,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto binary_op = sop.get_binary_op();
auto identity = sop.template get_identity<OutputType>();
auto dev_result = make_fixed_width_column(
data_type{type_to_id<OutputType>()}, num_segments, mask_state::UNALLOCATED, stream, mr);
auto dev_result_mview = dev_result->mutable_view();
@@ -291,16 +291,17 @@ std::unique_ptr<column> segmented_reduce(InputIterator d_in,
return dev_result;
}

template <typename Op,
typename InputIterator,
template <typename InputIterator,
typename OffsetIterator,
typename BinaryOp,
typename OutputType = typename thrust::iterator_value<InputIterator>::type,
typename std::enable_if_t<not is_fixed_width<OutputType>() ||
is_fixed_point<OutputType>()>* = nullptr>
typename std::enable_if_t<!(is_fixed_width<OutputType>() &&
!cudf::is_fixed_point<OutputType>())>* = nullptr>
std::unique_ptr<column> segmented_reduce(InputIterator,
OffsetIterator,
cudf::size_type,
op::simple_op<Op>,
BinaryOp,
OutputType,
rmm::cuda_stream_view,
rmm::mr::device_memory_resource*)
{
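The interface change above decouples this helper from `op::simple_op`: callers now supply any device binary operator together with an explicit identity value. Below is a host-side analogue of those semantics (illustration only, not from this commit): each segment [d_offset[i], d_offset[i+1]) is folded with binary_op starting from identity, so empty segments yield the identity.

#include <cstddef>
#include <vector>

template <typename T, typename BinaryOp>
std::vector<T> segmented_reduce_host(std::vector<T> const& in,
                                     std::vector<int> const& offsets,
                                     BinaryOp binary_op,
                                     T identity)
{
  std::vector<T> out;
  for (std::size_t seg = 0; seg + 1 < offsets.size(); ++seg) {
    T acc = identity;  // an empty segment reduces to the identity element
    for (int i = offsets[seg]; i < offsets[seg + 1]; ++i) { acc = binary_op(acc, in[i]); }
    out.push_back(acc);
  }
  return out;
}

// e.g. per-segment minimum: binary_op = [](int a, int b) { return a < b ? a : b; },
// identity = std::numeric_limits<int>::max().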
61 changes: 61 additions & 0 deletions cpp/include/cudf/detail/utilities/element_argminmax.cuh
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>

#include <type_traits>

namespace cudf {
namespace detail {

/**
* @brief Binary `argmin`/`argmax` operator
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
template <typename T>
struct element_argminmax_fn {
column_device_view const d_col;
bool const has_nulls;
bool const arg_min;

__device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
{
// The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
// github.com/NVIDIA/thrust/issues/1525
// where invalid random values may be passed here by thrust::reduce_by_key
auto out_of_bound_or_null = [this] __device__(size_type const& idx) {
return idx < 0 || idx >= this->d_col.size() ||
(this->has_nulls && this->d_col.is_null_nocheck(idx));
};
if (out_of_bound_or_null(lhs_idx)) { return rhs_idx; }
if (out_of_bound_or_null(rhs_idx)) { return lhs_idx; }

// Return `lhs_idx` iff:
// row(lhs_idx) < row(rhs_idx) and finding ArgMin, or
// row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
auto const less = d_col.element<T>(lhs_idx) < d_col.element<T>(rhs_idx);
return less == arg_min ? lhs_idx : rhs_idx;
}
};

} // namespace detail
} // namespace cudf
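
For illustration, a host-side analogue (hypothetical helper, not part of this commit) of the selection rule above: keep the left index exactly when `row(lhs) < row(rhs)` agrees with the arg_min flag, and let out-of-range indices (the sentinels `thrust::reduce_by_key` may feed in) lose the comparison.

#include <string>
#include <vector>

// One comparator step; folding it over 0, 1, 2, ... yields the argmin
// (arg_min == true) or argmax (arg_min == false) of the whole range.
inline int argminmax_step(std::vector<std::string> const& rows, int lhs, int rhs, bool arg_min)
{
  auto out_of_bound = [&](int idx) { return idx < 0 || idx >= static_cast<int>(rows.size()); };
  if (out_of_bound(lhs)) { return rhs; }
  if (out_of_bound(rhs)) { return lhs; }
  bool const less = rows[lhs] < rows[rhs];
  return less == arg_min ? lhs : rhs;  // same trick as element_argminmax_fn
}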
36 changes: 3 additions & 33 deletions cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -23,6 +23,7 @@
#include <cudf/column/column_view.hpp>
#include <cudf/detail/aggregation/aggregation.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/element_argminmax.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/table/row_operators.cuh>
#include <cudf/types.hpp>
@@ -40,37 +41,6 @@ namespace cudf {
namespace groupby {
namespace detail {

/**
* @brief Binary operator with index values into the input column.
*
* @tparam T Type of the underlying column. Must support '<' operator.
*/
template <typename T>
struct element_arg_minmax_fn {
column_device_view const d_col;
bool const has_nulls;
bool const arg_min;

__device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
{
// The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
// github.com/NVIDIA/thrust/issues/1525
// where invalid random values may be passed here by thrust::reduce_by_key
if (lhs_idx < 0 || lhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(lhs_idx))) {
return rhs_idx;
}
if (rhs_idx < 0 || rhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(rhs_idx))) {
return lhs_idx;
}

// Return `lhs_idx` iff:
// row(lhs_idx) < row(rhs_idx) and finding ArgMin, or
// row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
auto const less = d_col.element<T>(lhs_idx) < d_col.element<T>(rhs_idx);
return less == arg_min ? lhs_idx : rhs_idx;
}
};

/**
* @brief Value accessor for column which supports dictionary column too.
*
@@ -211,8 +181,8 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_support

if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
auto const binop =
element_arg_minmax_fn<T>{*d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN};
auto const binop = cudf::detail::element_argminmax_fn<T>{
*d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN};
do_reduction(count_iter, result_begin, binop);
} else {
using OpType = cudf::detail::corresponding_operator_t<K>;
136 changes: 132 additions & 4 deletions cpp/src/reductions/simple_segmented.cuh
@@ -16,12 +16,15 @@

#pragma once

#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/null_mask.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/reduction.cuh>
#include <cudf/detail/unary.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/element_argminmax.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
@@ -31,9 +34,12 @@

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>

#include <type_traits>

namespace cudf {
namespace reduction {
namespace simple {
@@ -70,18 +76,21 @@ std::unique_ptr<column> simple_segmented_reduction(column_view const& col,
auto simple_op = Op{};
size_type num_segments = offsets.size() - 1;

auto binary_op = simple_op.get_binary_op();
auto identity = simple_op.template get_identity<ResultType>();

// TODO: Explore rewriting null_replacing_element_transformer/element_transformer with nullate
auto result = [&] {
if (col.has_nulls()) {
auto f = simple_op.template get_null_replacing_element_transformer<ResultType>();
auto it = thrust::make_transform_iterator(dcol->pair_begin<InputType, true>(), f);
return cudf::reduction::detail::segmented_reduce(
it, offsets.begin(), num_segments, simple_op, stream, mr);
it, offsets.begin(), num_segments, binary_op, identity, stream, mr);
} else {
auto f = simple_op.template get_element_transformer<ResultType>();
auto it = thrust::make_transform_iterator(dcol->begin<InputType>(), f);
return cudf::reduction::detail::segmented_reduce(
it, offsets.begin(), num_segments, simple_op, stream, mr);
it, offsets.begin(), num_segments, binary_op, identity, stream, mr);
}
}();

@@ -103,6 +112,112 @@ std::unique_ptr<column> simple_segmented_reduction(column_view const& col,
return result;
}

/**
* @brief String segmented reduction for 'min', 'max'.
*
* This algorithm uses argmin/argmax as a custom comparator to build a gather
* map, then builds the output.
*
* @tparam InputType the input column data-type
* @tparam Op the operator of cudf::reduction::op::
* @param col Input column of data to reduce.
* @param offsets Indices to segment boundaries.
* @param null_handling If `null_policy::INCLUDE`, all elements in a segment
* must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`,
* the reduced value is valid if any element in the segment is valid.
* @param stream Used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Output column in device memory
*/

template <typename InputType,
typename Op,
CUDF_ENABLE_IF(std::is_same_v<Op, cudf::reduction::op::min> ||
std::is_same_v<Op, cudf::reduction::op::max>)>
std::unique_ptr<column> string_segmented_reduction(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Pass to simple_segmented_reduction, get indices to gather, perform gather here.
auto device_col = cudf::column_device_view::create(col, stream);

auto it = thrust::make_counting_iterator(0);
auto const num_segments = static_cast<size_type>(offsets.size()) - 1;

bool constexpr is_argmin = std::is_same_v<Op, cudf::reduction::op::min>;
auto string_comparator =
cudf::detail::element_argminmax_fn<InputType>{*device_col, col.has_nulls(), is_argmin};
auto constexpr identity =
is_argmin ? cudf::detail::ARGMIN_SENTINEL : cudf::detail::ARGMAX_SENTINEL;

auto gather_map =
cudf::reduction::detail::segmented_reduce(it,
offsets.begin(),
num_segments,
string_comparator,
identity,
stream,
rmm::mr::get_current_device_resource());
auto result = std::move(cudf::detail::gather(table_view{{col}},
*gather_map,
cudf::out_of_bounds_policy::NULLIFY,
cudf::detail::negative_index_policy::NOT_ALLOWED,
stream,
mr)
->release()[0]);
auto const [segmented_null_mask, segmented_null_count] =
cudf::detail::segmented_null_mask_reduction(col.null_mask(),
offsets.begin(),
offsets.end() - 1,
offsets.begin() + 1,
null_handling,
stream,
mr);

// If the segmented null mask contains any null values, the segmented null mask
// must be combined with the result null mask.
if (segmented_null_count > 0) {
if (result->null_count() == 0) {
// The result has no nulls. Use the segmented null mask.
result->set_null_mask(segmented_null_mask, segmented_null_count, stream);
} else {
// Compute the logical AND of the segmented output null mask and the
// result null mask to update the result null mask and null count.
auto result_mview = result->mutable_view();
std::vector masks{static_cast<bitmask_type const*>(result_mview.null_mask()),
static_cast<bitmask_type const*>(segmented_null_mask.data())};
std::vector<size_type> begin_bits{0, 0};
auto const valid_count = cudf::detail::inplace_bitmask_and(
device_span<bitmask_type>(static_cast<bitmask_type*>(result_mview.null_mask()),
num_bitmask_words(result->size())),
masks,
begin_bits,
result->size(),
stream,
mr);
result->set_null_count(result->size() - valid_count);
}
}

return result;
}

template <typename InputType,
typename Op,
CUDF_ENABLE_IF(!std::is_same_v<Op, cudf::reduction::op::min> &&
!std::is_same_v<Op, cudf::reduction::op::max>)>
std::unique_ptr<column> string_segmented_reduction(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FAIL("Segmented reduction on string column only supports min and max reduction.");
}

/**
* @brief Call reduce and return a column of type bool.
*
Expand Down Expand Up @@ -153,7 +268,9 @@ struct same_column_type_dispatcher {
}

public:
template <typename ElementType, std::enable_if_t<is_supported<ElementType>()>* = nullptr>
template <typename ElementType,
CUDF_ENABLE_IF(is_supported<ElementType>() &&
!std::is_same_v<ElementType, string_view>)>
std::unique_ptr<column> operator()(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
@@ -164,7 +281,18 @@
col, offsets, null_handling, stream, mr);
}

template <typename ElementType, std::enable_if_t<not is_supported<ElementType>()>* = nullptr>
template <typename ElementType,
CUDF_ENABLE_IF(is_supported<ElementType>() && std::is_same_v<ElementType, string_view>)>
std::unique_ptr<column> operator()(column_view const& col,
device_span<size_type const> offsets,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return string_segmented_reduction<ElementType, Op>(col, offsets, null_handling, stream, mr);
}

template <typename ElementType, CUDF_ENABLE_IF(!is_supported<ElementType>())>
std::unique_ptr<column> operator()(column_view const&,
device_span<size_type const>,
null_policy,
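For illustration, a host-side analogue (hypothetical, not part of this commit) of the gather-map algorithm documented for `string_segmented_reduction` above: reduce each segment to a winning row index with the argmin/argmax comparator, then gather the strings at those indices, turning out-of-range sentinel indices into nulls much like `out_of_bounds_policy::NULLIFY`.

#include <cstddef>
#include <optional>
#include <string>
#include <vector>

std::vector<std::optional<std::string>> segmented_max_strings(std::vector<std::string> const& col,
                                                              std::vector<int> const& offsets)
{
  std::vector<std::optional<std::string>> out;
  for (std::size_t seg = 0; seg + 1 < offsets.size(); ++seg) {
    int best = -1;  // sentinel, standing in for ARGMAX_SENTINEL on an empty segment
    for (int i = offsets[seg]; i < offsets[seg + 1]; ++i) {
      if (best < 0 || col[best] < col[i]) { best = i; }  // argmax step
    }
    // Gather: an invalid index becomes a null result for this segment.
    out.push_back(best < 0 ? std::optional<std::string>{} : std::optional<std::string>{col[best]});
  }
  return out;
}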