Initial pass for reduce sums

rapidsai · Nov 10, 2021 · 7ccdeb8 · 7ccdeb8
1 parent a9a7b2c
commit 7ccdeb8
Show file tree

Hide file tree

Showing 9 changed files with 535 additions and 12 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -360,12 +360,14 @@ add_library(
   src/reductions/nth_element.cu
   src/reductions/product.cu
   src/reductions/reductions.cpp
+  src/reductions/segmented_reductions.cpp
   src/reductions/scan/rank_scan.cu
   src/reductions/scan/scan.cpp
   src/reductions/scan/scan_exclusive.cu
   src/reductions/scan/scan_inclusive.cu
   src/reductions/std.cu
   src/reductions/sum.cu
+  src/reductions/segmented_sum.cu
   src/reductions/sum_of_squares.cu
   src/reductions/var.cu
   src/replace/clamp.cu

diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh
@@ -226,7 +226,6 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
  *
  * @param[in] d_in          the begin iterator to input
  * @param[in] d_offset      the begin iterator to offset
- * @param[in] num_items     the number of items
  * @param[in] num_segments  the number of segments
  * @param[in] op            the reduction operator
  * @param[in] stream        CUDA stream used for device memory operations and kernel launches.
@@ -244,15 +243,13 @@ template <typename Op,
                                     not cudf::is_fixed_point<OutputType>()>* = nullptr>
 std::unique_ptr<column> segmented_reduce(InputIterator d_in,
                                          OffsetIterator d_offset,
-                                         cudf::size_type num_items,
                                          cudf::size_type num_segments,
                                          op::simple_op<Op> sop,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  using InputType = OutputType;
-  auto binary_op  = sop.get_binary_op();
-  auto identity   = sop.template get_identity<OutputType>();
+  auto binary_op = sop.get_binary_op();
+  auto identity  = sop.template get_identity<OutputType>();
   // auto dev_result = rmm::device_scalar<OutputType>{identity, stream, mr};
   auto dev_result = make_fixed_width_column(
     data_type{type_to_id<OutputType>()}, num_segments, mask_state::UNALLOCATED, stream, mr);
@@ -264,11 +261,10 @@ std::unique_ptr<column> segmented_reduce(InputIterator d_in,
   cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
                                      temp_storage_bytes,
                                      d_in,
-                                     dev_result_mview.data<InputType>(),
+                                     dev_result_mview.data<OutputType>(),
                                      num_segments,
                                      d_offset,
-                                     d_offset + num_segments + 1,
-                                     num_items,
+                                     d_offset + 1,
                                      binary_op,
                                      identity,
                                      stream.value());
@@ -278,18 +274,27 @@ std::unique_ptr<column> segmented_reduce(InputIterator d_in,
   cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
                                      temp_storage_bytes,
                                      d_in,
-                                     dev_result_mview.data<InputType>(),
+                                     dev_result_mview.data<OutputType>(),
                                      num_segments,
                                      d_offset,
-                                     d_offset + num_segments + 1,
-                                     num_items,
+                                     d_offset + 1,
                                      binary_op,
                                      identity,
                                      stream.value());
 
   return dev_result;
 }
 
+template <typename Op,
+          typename InputIterator,
+          typename OutputType = typename thrust::iterator_value<InputIterator>::type,
+          typename std::enable_if_t<std::is_same_v<OutputType, string_view>>* = nullptr,
+          typename... Args>
+std::unique_ptr<column> segmented_reduce(Args&&...)
+{
+  CUDF_FAIL("Segment reduction for string type is unsupported.");
+}
+
 }  // namespace detail
 }  // namespace reduction
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp
@@ -254,5 +254,125 @@ std::unique_ptr<scalar> nth_element(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Computes sums to each segment of the input column
+ *
+ * If all elements in an input segment are null, the result is null.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to `output_dtype`
+ * @throw cudf::logic_error if `output_dtype` is not arithmetic point type
+ *
+ * @param col input column to compute sum
+ * @param offsets Indices to identify segment boundaries
+ * @param output_dtype data type of return type and typecast elements of input column
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Sum as column of type `output_dtype`.
+ */
+std::unique_ptr<column> segmented_sum(
+  column_view const& col,
+  column_view const& offsets,
+  data_type const output_dtype,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+// /**
+//  * @brief Computes minimum of elements in input column
+//  *
+//  * If all elements in input column are null, output scalar is null.
+//  *
+//  * @throw cudf::logic_error if input column type is convertible to `output_dtype`
+//  *
+//  * @param col input column to compute minimum.
+//  * @param output_dtype data type of return type and typecast elements of input column
+//  * @param stream CUDA stream used for device memory operations and kernel launches.
+//  * @param mr Device memory resource used to allocate the returned scalar's device memory
+//  * @return Minimum element as scalar of type `output_dtype`.
+//  */
+// std::unique_ptr<scalar> min(
+//   column_view const& col,
+//   data_type const output_dtype,
+//   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+//   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+// /**
+//  * @brief Computes maximum of elements in input column
+//  *
+//  * If all elements in input column are null, output scalar is null.
+//  *
+//  * @throw cudf::logic_error if input column type is convertible to `output_dtype`
+//  *
+//  * @param col input column to compute maximum.
+//  * @param output_dtype data type of return type and typecast elements of input column
+//  * @param stream CUDA stream used for device memory operations and kernel launches.
+//  * @param mr Device memory resource used to allocate the returned scalar's device memory
+//  * @return Maximum element as scalar of type `output_dtype`.
+//  */
+// std::unique_ptr<scalar> max(
+//   column_view const& col,
+//   data_type const output_dtype,
+//   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+//   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+// /**
+//  * @brief Computes any of elements in input column is true when typecasted to bool
+//  *
+//  * If all elements in input column are null, output scalar is null.
+//  *
+//  * @throw cudf::logic_error if input column type is not convertible to bool
+//  * @throw cudf::logic_error if `output_dtype` is not bool
+//  *
+//  * @param col input column to compute any_of.
+//  * @param output_dtype data type of return type and typecast elements of input column
+//  * @param stream CUDA stream used for device memory operations and kernel launches.
+//  * @param mr Device memory resource used to allocate the returned scalar's device memory
+//  * @return bool scalar if any of elements is true when typecasted to bool
+//  */
+// std::unique_ptr<scalar> any(
+//   column_view const& col,
+//   data_type const output_dtype,
+//   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+//   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+// /**
+//  * @brief Computes all of elements in input column is true when typecasted to bool
+//  *
+//  * If all elements in input column are null, output scalar is null.
+//  *
+//  * @throw cudf::logic_error if input column type is not convertible to bool
+//  * @throw cudf::logic_error if `output_dtype` is not bool
+//  *
+//  * @param col input column to compute all_of.
+//  * @param output_dtype data type of return type and typecast elements of input column
+//  * @param stream CUDA stream used for device memory operations and kernel launches.
+//  * @param mr Device memory resource used to allocate the returned scalar's device memory
+//  * @return bool scalar if all of elements is true when typecasted to bool
+//  */
+// std::unique_ptr<scalar> all(
+//   column_view const& col,
+//   data_type const output_dtype,
+//   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+//   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+// /**
+//  * @brief Computes product of elements in input column
+//  *
+//  * If all elements in input column are null, output scalar is null.
+//  *
+//  * @throw cudf::logic_error if input column type is not convertible to `output_dtype`
+//  * @throw cudf::logic_error if `output_dtype` is not arithmetic point type
+//  *
+//  * @param col input column to compute product.
+//  * @param output_dtype data type of return type and typecast elements of input column
+//  * @param stream CUDA stream used for device memory operations and kernel launches.
+//  * @param mr Device memory resource used to allocate the returned scalar's device memory
+//  * @return Product as scalar of type `output_dtype`.
+//  */
+// std::unique_ptr<scalar> product(
+//   column_view const& col,
+//   data_type const output_dtype,
+//   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+//   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 }  // namespace reduction
 }  // namespace cudf
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
@@ -69,6 +69,13 @@ std::unique_ptr<scalar> reduce(
   data_type output_dtype,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+std::unique_ptr<column> segmented_reduce(
+  column_view const& col,
+  column_view const& offsets,
+  std::unique_ptr<aggregation> const& agg,
+  data_type output_dtype,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief  Computes the scan of a column.
  *

diff --git a/cpp/src/reductions/segmented_reductions.cpp b/cpp/src/reductions/segmented_reductions.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/quantiles.hpp>
+#include <cudf/detail/reduction_functions.hpp>
+#include <cudf/detail/sorting.hpp>
+#include <cudf/detail/stream_compaction.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+
+#include <cudf/structs/structs_column_view.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace detail {
+struct segmented_reduce_dispatch_functor {
+  column_view const col;
+  column_view const offsets;
+  data_type output_dtype;
+  rmm::mr::device_memory_resource* mr;
+  rmm::cuda_stream_view stream;
+
+  segmented_reduce_dispatch_functor(column_view const& col,
+                                    column_view const& offsets,
+                                    data_type output_dtype,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+    : col(col), offsets(offsets), output_dtype(output_dtype), mr(mr), stream(stream)
+  {
+  }
+
+  template <aggregation::Kind k>
+  std::unique_ptr<column> operator()(std::unique_ptr<aggregation> const& agg)
+  {
+    switch (k) {
+      case aggregation::SUM:
+        return reduction::segmented_sum(col, offsets, output_dtype, stream, mr);
+        break;
+      // case aggregation::PRODUCT: return reduction::product(col, output_dtype, stream, mr); break;
+      // case aggregation::MIN: return reduction::min(col, output_dtype, stream, mr); break;
+      // case aggregation::MAX: return reduction::max(col, output_dtype, stream, mr); break;
+      // case aggregation::ANY: return reduction::any(col, output_dtype, stream, mr); break;
+      // case aggregation::ALL: return reduction::all(col, output_dtype, stream, mr); break;
+      default: CUDF_FAIL("Unsupported aggregation type.");
+    }
+  }
+};
+
+std::unique_ptr<column> segmented_reduce(
+  column_view const& col,
+  column_view const& offsets,
+  std::unique_ptr<aggregation> const& agg,
+  data_type output_dtype,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  // TODO: handle invalid inputs.
+
+  return aggregation_dispatcher(
+    agg->kind, segmented_reduce_dispatch_functor{col, offsets, output_dtype, stream, mr}, agg);
+}
+}  // namespace detail
+
+std::unique_ptr<column> segmented_reduce(column_view const& col,
+                                         column_view const& offsets,
+                                         std::unique_ptr<aggregation> const& agg,
+                                         data_type output_dtype,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::segmented_reduce(col, offsets, agg, output_dtype, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented_sum.cu b/cpp/src/reductions/segmented_sum.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/reduction_functions.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <reductions/simple_segmented.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace reduction {
+
+std::unique_ptr<cudf::column> segmented_sum(column_view const& col,
+                                            column_view const& offsets,
+                                            cudf::data_type const output_dtype,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  return cudf::type_dispatcher(col.type(),
+                               simple::column_type_dispatcher<cudf::reduction::op::sum>{},
+                               col,
+                               offsets,
+                               output_dtype,
+                               stream,
+                               mr);
+}
+
+}  // namespace reduction
+}  // namespace cudf