From 182ee2c4964bccaf2334bfb176ba0ae646b3deab Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 Feb 2023 08:49:30 -0500
Subject: [PATCH] Add compound aggregations to cudf::segmented_reduce (#12573)

Adds mean, variance, and standard deviation aggregation support to `cudf::segmented_reduce`. These are compound (multi-step) aggregations and are modeled after the same aggregations supported but `cudf::reduce`. Once this approved and merged, the visitor pattern for this approach will be reworked for both `cudf::reduce` and `cudf::segmented_reduce` as per [#10432](https://github.com/rapidsai/cudf/issues/10432#issuecomment-1067560695).

The source tree for `src/reductions` has been adjusted to put all segmented-reduce source files into `src/reductions/segmented` and removing the `segmented_` prefix from those file names.
Also, the segmented-reduce functions have been moved from `cudf/detail/reduction_functions.hpp` into their own `cudf/detail/segmented_reduction_functions.hpp`. Likewise, the segmented-reduce CUB calls have been moved from `cudf/detail/reduction.cuh` to the new `cudf/detail/segmented_reduction.cuh` to help minimize including CUB headers.

Additionally, the sum-of-squares aggregation is also included since it was a simple reduction only requiring the appropriate aggregation class registration and source file.

Finally, gtests are added for these new types. The compound types only support floating-point outputs.

Follow on PRs will address the visitor pattern already mentioned above as well as additional data types. Discussion on additional aggregations will occur in the reference issue #10432.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/12573
---
 conda/recipes/libcudf/meta.yaml               |   1 +
 cpp/CMakeLists.txt                            |  19 +-
 .../cudf/detail/aggregation/aggregation.hpp   |  10 +-
 cpp/include/cudf/detail/reduction.cuh         |  91 +----
 .../cudf/detail/reduction_functions.hpp       | 172 +--------
 .../cudf/detail/segmented_reduction.cuh       | 198 ++++++++++
 .../detail/segmented_reduction_functions.hpp  | 338 ++++++++++++++++++
 cpp/src/aggregation/aggregation.cpp           |  10 +-
 .../stream_compaction/apply_boolean_mask.cu   |   4 +-
 cpp/src/reductions/compound.cuh               |  12 +-
 cpp/src/reductions/minmax.cu                  |   2 +-
 .../{segmented_all.cu => segmented/all.cu}    |   4 +-
 .../{segmented_any.cu => segmented/any.cu}    |   4 +-
 cpp/src/reductions/segmented/compound.cuh     | 182 ++++++++++
 .../{segmented_max.cu => segmented/max.cu}    |   4 +-
 cpp/src/reductions/segmented/mean.cu          |  40 +++
 .../{segmented_min.cu => segmented/min.cu}    |   4 +-
 .../product.cu}                               |   4 +-
 .../reductions.cpp}                           |  22 +-
 .../simple.cuh}                               |  83 +----
 cpp/src/reductions/segmented/std.cu           |  41 +++
 .../{segmented_sum.cu => segmented/sum.cu}    |   4 +-
 .../reductions/segmented/sum_of_squares.cu    |  39 ++
 .../reductions/segmented/update_validity.cu   |  49 +++
 .../reductions/segmented/update_validity.hpp  |  58 +++
 cpp/src/reductions/segmented/var.cu           |  40 +++
 cpp/src/reductions/std.cu                     |   4 +-
 cpp/src/reductions/var.cu                     |   4 +-
 .../reductions/segmented_reduction_tests.cpp  | 240 ++++++++++++-
 29 files changed, 1318 insertions(+), 365 deletions(-)
 create mode 100644 cpp/include/cudf/detail/segmented_reduction.cuh
 create mode 100644 cpp/include/cudf/detail/segmented_reduction_functions.hpp
 rename cpp/src/reductions/{segmented_all.cu => segmented/all.cu} (94%)
 rename cpp/src/reductions/{segmented_any.cu => segmented/any.cu} (94%)
 create mode 100644 cpp/src/reductions/segmented/compound.cuh
 rename cpp/src/reductions/{segmented_max.cu => segmented/max.cu} (94%)
 create mode 100644 cpp/src/reductions/segmented/mean.cu
 rename cpp/src/reductions/{segmented_min.cu => segmented/min.cu} (94%)
 rename cpp/src/reductions/{segmented_product.cu => segmented/product.cu} (94%)
 rename cpp/src/reductions/{segmented_reductions.cpp => segmented/reductions.cpp} (87%)
 rename cpp/src/reductions/{simple_segmented.cuh => segmented/simple.cuh} (84%)
 create mode 100644 cpp/src/reductions/segmented/std.cu
 rename cpp/src/reductions/{segmented_sum.cu => segmented/sum.cu} (95%)
 create mode 100644 cpp/src/reductions/segmented/sum_of_squares.cu
 create mode 100644 cpp/src/reductions/segmented/update_validity.cu
 create mode 100644 cpp/src/reductions/segmented/update_validity.hpp
 create mode 100644 cpp/src/reductions/segmented/var.cu

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 7372d4d4de6..911080ebdb6 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -110,6 +110,7 @@ outputs:
         - test -f $PREFIX/include/cudf/detail/scan.hpp
         - test -f $PREFIX/include/cudf/detail/scatter.hpp
         - test -f $PREFIX/include/cudf/detail/search.hpp
+        - test -f $PREFIX/include/cudf/detail/segmented_reduction_functions.hpp
         - test -f $PREFIX/include/cudf/detail/sequence.hpp
         - test -f $PREFIX/include/cudf/detail/sorting.hpp
         - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9a9b5d1e5ed..19c118016bf 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -441,13 +441,18 @@ add_library(
   src/reductions/scan/scan.cpp
   src/reductions/scan/scan_exclusive.cu
   src/reductions/scan/scan_inclusive.cu
-  src/reductions/segmented_all.cu
-  src/reductions/segmented_any.cu
-  src/reductions/segmented_max.cu
-  src/reductions/segmented_min.cu
-  src/reductions/segmented_product.cu
-  src/reductions/segmented_reductions.cpp
-  src/reductions/segmented_sum.cu
+  src/reductions/segmented/all.cu
+  src/reductions/segmented/any.cu
+  src/reductions/segmented/max.cu
+  src/reductions/segmented/mean.cu
+  src/reductions/segmented/min.cu
+  src/reductions/segmented/product.cu
+  src/reductions/segmented/reductions.cpp
+  src/reductions/segmented/std.cu
+  src/reductions/segmented/sum.cu
+  src/reductions/segmented/sum_of_squares.cu
+  src/reductions/segmented/update_validity.cu
+  src/reductions/segmented/var.cu
   src/reductions/std.cu
   src/reductions/sum.cu
   src/reductions/sum_of_squares.cu
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 360c314f2db..e269d4d2e13 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -292,7 +292,9 @@ class all_aggregation final : public reduce_aggregation, public segmented_reduce
 /**
  * @brief Derived class for specifying a sum_of_squares aggregation
  */
-class sum_of_squares_aggregation final : public groupby_aggregation, public reduce_aggregation {
+class sum_of_squares_aggregation final : public groupby_aggregation,
+                                         public reduce_aggregation,
+                                         public segmented_reduce_aggregation {
  public:
   sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {}
 
@@ -313,7 +315,8 @@ class sum_of_squares_aggregation final : public groupby_aggregation, public redu
  */
 class mean_aggregation final : public rolling_aggregation,
                                public groupby_aggregation,
-                               public reduce_aggregation {
+                               public reduce_aggregation,
+                               public segmented_reduce_aggregation {
  public:
   mean_aggregation() : aggregation(MEAN) {}
 
@@ -353,7 +356,8 @@ class m2_aggregation : public groupby_aggregation {
  */
 class std_var_aggregation : public rolling_aggregation,
                             public groupby_aggregation,
-                            public reduce_aggregation {
+                            public reduce_aggregation,
+                            public segmented_reduce_aggregation {
  public:
   size_type _ddof;  ///< Delta degrees of freedom
 
diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh
index f1fe0d2e1a9..9dc3b996afc 100644
--- a/cpp/include/cudf/detail/reduction.cuh
+++ b/cpp/include/cudf/detail/reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "reduction_operators.cuh"
+#include <cudf/detail/reduction_operators.cuh>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -27,7 +27,6 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/device/device_reduce.cuh>
-#include <cub/device/device_segmented_reduce.cuh>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -229,92 +228,6 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
   return std::unique_ptr<scalar>(result);
 }
 
-/**
- * @brief Compute the specified simple reduction over each of the segments in the
- * input range of elements.
- *
- * @tparam InputIterator    the input column iterator
- * @tparam OffsetIterator   the offset column iterator
- * @tparam OutputIterator   the output column iterator
- * @tparam BinaryOp         the device binary operator used to reduce
- * @tparam OutputType       the output type of reduction
- *
- * @param[in] d_in          the begin iterator to input
- * @param[in] d_offset_begin the begin iterator to offset
- * @param[in] d_offset_end  the end iterator to offset. Note: This is
- * num_segments+1 elements past `d_offset_begin`.
- * @param[out] d_out        the begin iterator to output
- * @param[in] binary_op     the reduction operator
- * @param[in] identity      the identity element of the reduction operator
- * @param[in] initial_value Initial value of the reduction
- * @param[in] stream        CUDA stream used for device memory operations and kernel launches
- *
- */
-template <typename InputIterator,
-          typename OffsetIterator,
-          typename OutputIterator,
-          typename BinaryOp,
-          typename OutputType = typename thrust::iterator_value<OutputIterator>::type,
-          typename std::enable_if_t<is_fixed_width<OutputType>() &&
-                                    !cudf::is_fixed_point<OutputType>()>* = nullptr>
-void segmented_reduce(InputIterator d_in,
-                      OffsetIterator d_offset_begin,
-                      OffsetIterator d_offset_end,
-                      OutputIterator d_out,
-                      BinaryOp binary_op,
-                      OutputType initial_value,
-                      rmm::cuda_stream_view stream)
-{
-  auto const num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;
-
-  // Allocate temporary storage
-  rmm::device_buffer d_temp_storage;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     d_in,
-                                     d_out,
-                                     num_segments,
-                                     d_offset_begin,
-                                     d_offset_begin + 1,
-                                     binary_op,
-                                     initial_value,
-                                     stream.value());
-  d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-
-  // Run reduction
-  cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     d_in,
-                                     d_out,
-                                     num_segments,
-                                     d_offset_begin,
-                                     d_offset_begin + 1,
-                                     binary_op,
-                                     initial_value,
-                                     stream.value());
-}
-
-template <typename InputIterator,
-          typename OffsetIterator,
-          typename OutputIterator,
-          typename BinaryOp,
-          typename OutputType = typename thrust::iterator_value<OutputIterator>::type,
-          typename std::enable_if_t<!(is_fixed_width<OutputType>() &&
-                                      !cudf::is_fixed_point<OutputType>())>* = nullptr>
-void segmented_reduce(InputIterator,
-                      OffsetIterator,
-                      OffsetIterator,
-                      OutputIterator,
-                      BinaryOp,
-                      OutputType,
-                      rmm::cuda_stream_view)
-{
-  CUDF_FAIL(
-    "Unsupported data types called on segmented_reduce. Only numeric and chrono types are "
-    "supported.");
-}
-
 }  // namespace detail
 }  // namespace reduction
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp
index a2de286f283..1f892bb90c5 100644
--- a/cpp/include/cudf/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/detail/reduction_functions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -216,7 +216,7 @@ std::unique_ptr<scalar> mean(
 std::unique_ptr<scalar> variance(
   column_view const& col,
   data_type const output_dtype,
-  cudf::size_type ddof,
+  size_type ddof,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -239,7 +239,7 @@ std::unique_ptr<scalar> variance(
 std::unique_ptr<scalar> standard_deviation(
   column_view const& col,
   data_type const output_dtype,
-  cudf::size_type ddof,
+  size_type ddof,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -338,171 +338,5 @@ std::unique_ptr<scalar> merge_sets(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Compute sum of each segment in input column.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
- * @throw cudf::logic_error if `output_dtype` is not an arithmetic type.
- *
- * @param col Input column to compute sum
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype Data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each sum
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Sums of segments in type `output_dtype`
- */
-std::unique_ptr<column> segmented_sum(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Computes product of each segment in input column.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
- * @throw cudf::logic_error if `output_dtype` is not an arithmetic type.
- *
- * @param col Input column to compute product
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each product
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @return Product as scalar of type `output_dtype`
- */
-std::unique_ptr<column> segmented_product(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Compute minimum of each segment in input column.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is convertible to `output_dtype`.
- *
- * @param col Input column to compute minimum
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype Data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each minimum
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @return Minimums of segments in type `output_dtype`
- */
-std::unique_ptr<column> segmented_min(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Compute maximum of each segment in input column.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is convertible to `output_dtype`.
- *
- * @param col Input column to compute maximum
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype Data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each maximum
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @return Maximums of segments in type `output_dtype`
- */
-std::unique_ptr<column> segmented_max(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Compute if any of the values in the segment are true when typecasted to bool.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is not convertible to bool.
- * @throw cudf::logic_error if `output_dtype` is not bool8.
- *
- * @param col Input column to compute any
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype Data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each any
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @return Column of bool8 for the results of the segments
- */
-std::unique_ptr<column> segmented_any(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Compute if all of the values in the segment are true when typecasted to bool.
- *
- * If an input segment is empty, the segment result is null.
- *
- * @throw cudf::logic_error if input column type is not convertible to bool.
- * @throw cudf::logic_error if `output_dtype` is not bool8.
- *
- * @param col Input column to compute all
- * @param offsets Indices to identify segment boundaries
- * @param output_dtype Data type of return type and typecast elements of input column
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
- * @param init Initial value of each all
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @return Column of bool8 for the results of the segments
- */
-std::unique_ptr<column> segmented_all(
-  column_view const& col,
-  device_span<size_type const> offsets,
-  data_type const output_dtype,
-  null_policy null_handling,
-  std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 }  // namespace reduction
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/segmented_reduction.cuh b/cpp/include/cudf/detail/segmented_reduction.cuh
new file mode 100644
index 00000000000..9a49c1abe38
--- /dev/null
+++ b/cpp/include/cudf/detail/segmented_reduction.cuh
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/reduction_operators.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cub/device/device_segmented_reduce.cuh>
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace reduction {
+namespace detail {
+
+/**
+ * @brief Compute the specified simple reduction over each of the segments in the
+ * input range of elements
+ *
+ * @tparam InputIterator    Input iterator type
+ * @tparam OffsetIterator   Offset iterator type
+ * @tparam OutputIterator   Output iterator type
+ * @tparam BinaryOp         Binary operator used for reduce
+ * @tparam OutputType       The output type derived from the OutputIterator
+ *
+ * @param d_in           Input data iterator
+ * @param d_offset_begin Begin iterator to segment indices
+ * @param d_offset_end   End iterator to segment indices
+ * @param d_out          Output data iterator
+ * @param binary_op      The reduction operator
+ * @param initial_value  Initial value of the reduction
+ * @param stream         CUDA stream used for device memory operations and kernel launches
+ *
+ */
+template <typename InputIterator,
+          typename OffsetIterator,
+          typename OutputIterator,
+          typename BinaryOp,
+          typename OutputType = typename thrust::iterator_value<OutputIterator>::type,
+          typename std::enable_if_t<is_fixed_width<OutputType>() &&
+                                    !cudf::is_fixed_point<OutputType>()>* = nullptr>
+void segmented_reduce(InputIterator d_in,
+                      OffsetIterator d_offset_begin,
+                      OffsetIterator d_offset_end,
+                      OutputIterator d_out,
+                      BinaryOp binary_op,
+                      OutputType initial_value,
+                      rmm::cuda_stream_view stream)
+{
+  auto const num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;
+
+  // Allocate temporary storage
+  size_t temp_storage_bytes = 0;
+  cub::DeviceSegmentedReduce::Reduce(nullptr,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     d_out,
+                                     num_segments,
+                                     d_offset_begin,
+                                     d_offset_begin + 1,
+                                     binary_op,
+                                     initial_value,
+                                     stream.value());
+  auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
+
+  // Run reduction
+  cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
+                                     temp_storage_bytes,
+                                     d_in,
+                                     d_out,
+                                     num_segments,
+                                     d_offset_begin,
+                                     d_offset_begin + 1,
+                                     binary_op,
+                                     initial_value,
+                                     stream.value());
+}
+
+template <typename InputIterator,
+          typename OffsetIterator,
+          typename OutputIterator,
+          typename BinaryOp,
+          typename OutputType = typename thrust::iterator_value<OutputIterator>::type,
+          typename std::enable_if_t<!(is_fixed_width<OutputType>() &&
+                                      !cudf::is_fixed_point<OutputType>())>* = nullptr>
+void segmented_reduce(InputIterator,
+                      OffsetIterator,
+                      OffsetIterator,
+                      OutputIterator,
+                      BinaryOp,
+                      OutputType,
+                      rmm::cuda_stream_view)
+{
+  CUDF_FAIL(
+    "Unsupported data types called on segmented_reduce. Only numeric and chrono types are "
+    "supported.");
+}
+
+/**
+ * @brief Compute reduction by the compound operator (reduce and transform)
+ *
+ * The reduction operator must have an `intermediate::compute_result()` method.
+ * This method performs reduction using binary operator `Op::Op` and calculates the
+ * result to `OutputType` using `compute_result()` through the transform method.
+ *
+ * @tparam Op              Reduction operator
+ * @tparam InputIterator   Input iterator type
+ * @tparam OffsetIterator  Offsets iterator type
+ * @tparam OutputIterator  Output iterator type
+ *
+ * @param d_in           Input data iterator
+ * @param d_offset_begin Begin iterator to segment indices
+ * @param d_offset_end   End iterator to segment indices
+ * @param d_out          Output data iterator
+ * @param op             The reduction operator
+ * @param ddof           Delta degrees of freedom used for standard deviation and variance
+ * @param d_valid_counts Number of valid values per segment
+ * @param stream         CUDA stream used for device memory operations and kernel launches
+ */
+template <typename Op, typename InputIterator, typename OffsetIterator, typename OutputIterator>
+void segmented_reduce(InputIterator d_in,
+                      OffsetIterator d_offset_begin,
+                      OffsetIterator d_offset_end,
+                      OutputIterator d_out,
+                      op::compound_op<Op> op,
+                      size_type ddof,
+                      size_type* d_valid_counts,
+                      rmm::cuda_stream_view stream)
+{
+  using OutputType         = typename thrust::iterator_value<OutputIterator>::type;
+  using IntermediateType   = typename thrust::iterator_value<InputIterator>::type;
+  auto num_segments        = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end));
+  auto const binary_op     = op.get_binary_op();
+  auto const initial_value = op.template get_identity<IntermediateType>();
+
+  rmm::device_uvector<IntermediateType> intermediate_result{static_cast<std::size_t>(num_segments),
+                                                            stream};
+
+  // Allocate temporary storage
+  size_t temp_storage_bytes = 0;
+  cub::DeviceSegmentedReduce::Reduce(nullptr,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     intermediate_result.data(),
+                                     num_segments,
+                                     d_offset_begin,
+                                     d_offset_begin + 1,
+                                     binary_op,
+                                     initial_value,
+                                     stream.value());
+  auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
+
+  // Run reduction
+  cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(),
+                                     temp_storage_bytes,
+                                     d_in,
+                                     intermediate_result.data(),
+                                     num_segments,
+                                     d_offset_begin,
+                                     d_offset_begin + 1,
+                                     binary_op,
+                                     initial_value,
+                                     stream.value());
+
+  // compute the result value from intermediate value in device
+  thrust::transform(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(num_segments),
+    d_out,
+    [ir = intermediate_result.data(), op, d_valid_counts, ddof] __device__(auto idx) {
+      auto const count = d_valid_counts[idx];
+      return count > 0 ? op.template compute_result<OutputType>(ir[idx], count, ddof)
+                       : OutputType{0};
+    });
+}
+
+}  // namespace detail
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/detail/segmented_reduction_functions.hpp
new file mode 100644
index 00000000000..7b5628fa49a
--- /dev/null
+++ b/cpp/include/cudf/detail/segmented_reduction_functions.hpp
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <optional>
+
+namespace cudf {
+namespace reduction {
+
+/**
+ * @brief Compute sum of each segment in the input column
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
+ * @throw cudf::logic_error if `output_dtype` is not an arithmetic type.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Sums of segments as type `output_dtype`
+ */
+std::unique_ptr<column> segmented_sum(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Computes product of each segment in the input column
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
+ * @throw cudf::logic_error if `output_dtype` is not an arithmetic type.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Product of segments as type `output_dtype`
+ */
+std::unique_ptr<column> segmented_product(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Compute minimum of each segment in the input column
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Minimums of segments as type `output_dtype`
+ */
+std::unique_ptr<column> segmented_min(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Compute maximum of each segment in the input column
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to `output_dtype`.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Maximums of segments as type `output_dtype`
+ */
+std::unique_ptr<column> segmented_max(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Compute if any of the values in the segment are true when typecasted to bool
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to bool.
+ * @throw cudf::logic_error if `output_dtype` is not BOOL8.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of type BOOL8 for the results of the segments
+ */
+std::unique_ptr<column> segmented_any(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Compute if all of the values in the segment are true when typecasted to bool
+ *
+ * If an input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not convertible to bool.
+ * @throw cudf::logic_error if `output_dtype` is not BOOL8.
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param init Initial value of each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of BOOL8 for the results of the segments
+ */
+std::unique_ptr<column> segmented_all(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Computes mean of elements of segments in the input column
+ *
+ * If input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not arithmetic type
+ * @throw cudf::logic_error if `output_dtype` is not floating point type
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of `output_dtype` for the reduction results of the segments
+ */
+std::unique_ptr<column> segmented_mean(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Computes sum of squares of elements of segments in the input column
+ *
+ * If input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not arithmetic type
+ * @throw cudf::logic_error if `output_dtype` is not an arithmetic type
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of `output_dtype` for the reduction results of the segments
+ */
+std::unique_ptr<column> segmented_sum_of_squares(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Computes the standard deviation of elements of segments in the input column
+ *
+ * If input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not arithmetic type
+ * @throw cudf::logic_error if `output_dtype` is not floating point type
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param ddof Delta degrees of freedom.
+ *             The divisor used is N - ddof, where N the number of elements in each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of `output_dtype` for the reduction results of the segments
+ */
+std::unique_ptr<column> segmented_standard_deviation(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  size_type ddof,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Computes the variance of elements of segments in the input column
+ *
+ * If input segment is empty, the segment result is null.
+ *
+ * If `null_handling==null_policy::INCLUDE`, all elements in a segment must be valid
+ * for the reduced value to be valid.
+ * If `null_handling==null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @throw cudf::logic_error if input column type is not arithmetic type
+ * @throw cudf::logic_error if `output_dtype` is not floating point type
+ *
+ * @param col Input column data
+ * @param offsets Indices to identify segment boundaries within input `col`
+ * @param output_dtype Data type of the output column
+ * @param null_handling Specifies how null elements are processed for each segment
+ * @param ddof Delta degrees of freedom.
+ *             The divisor used is N - ddof, where N the number of elements in each segment
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Column of `output_dtype` for the reduction results of the segments
+ */
+std::unique_ptr<column> segmented_variance(
+  column_view const& col,
+  device_span<size_type const> offsets,
+  data_type const output_dtype,
+  null_policy null_handling,
+  size_type ddof,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 6dd014970c7..07c53b3a421 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -492,6 +492,8 @@ template std::unique_ptr<aggregation> make_sum_of_squares_aggregation<aggregatio
 template std::unique_ptr<groupby_aggregation>
 make_sum_of_squares_aggregation<groupby_aggregation>();
 template std::unique_ptr<reduce_aggregation> make_sum_of_squares_aggregation<reduce_aggregation>();
+template std::unique_ptr<segmented_reduce_aggregation>
+make_sum_of_squares_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a MEAN aggregation
 template <typename Base>
@@ -503,6 +505,8 @@ template std::unique_ptr<aggregation> make_mean_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_mean_aggregation<rolling_aggregation>();
 template std::unique_ptr<groupby_aggregation> make_mean_aggregation<groupby_aggregation>();
 template std::unique_ptr<reduce_aggregation> make_mean_aggregation<reduce_aggregation>();
+template std::unique_ptr<segmented_reduce_aggregation>
+make_mean_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a M2 aggregation
 template <typename Base>
@@ -526,6 +530,8 @@ template std::unique_ptr<groupby_aggregation> make_variance_aggregation<groupby_
   size_type ddof);
 template std::unique_ptr<reduce_aggregation> make_variance_aggregation<reduce_aggregation>(
   size_type ddof);
+template std::unique_ptr<segmented_reduce_aggregation>
+make_variance_aggregation<segmented_reduce_aggregation>(size_type ddof);
 
 /// Factory to create a STD aggregation
 template <typename Base>
@@ -540,6 +546,8 @@ template std::unique_ptr<groupby_aggregation> make_std_aggregation<groupby_aggre
   size_type ddof);
 template std::unique_ptr<reduce_aggregation> make_std_aggregation<reduce_aggregation>(
   size_type ddof);
+template std::unique_ptr<segmented_reduce_aggregation>
+make_std_aggregation<segmented_reduce_aggregation>(size_type ddof);
 
 /// Factory to create a MEDIAN aggregation
 template <typename Base>
diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
index c1c17dc0688..5acb1cb8849 100644
--- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,8 +20,8 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/reduction_functions.hpp>
 #include <cudf/detail/replace.hpp>
+#include <cudf/detail/segmented_reduction_functions.hpp>
 #include <cudf/detail/stream_compaction.hpp>
 #include <cudf/lists/detail/stream_compaction.hpp>
 #include <cudf/lists/stream_compaction.hpp>
diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh
index f901ebd2c24..9458ae2d581 100644
--- a/cpp/src/reductions/compound.cuh
+++ b/cpp/src/reductions/compound.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ namespace detail {
 template <typename ElementType, typename ResultType, typename Op>
 std::unique_ptr<scalar> compound_reduction(column_view const& col,
                                            data_type const output_dtype,
-                                           cudf::size_type ddof,
+                                           size_type ddof,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
@@ -99,7 +99,7 @@ struct result_type_dispatcher {
   template <typename ResultType, std::enable_if_t<is_supported_v<ResultType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      cudf::data_type const output_dtype,
-                                     cudf::size_type ddof,
+                                     size_type ddof,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
@@ -109,7 +109,7 @@ struct result_type_dispatcher {
   template <typename ResultType, std::enable_if_t<not is_supported_v<ResultType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      cudf::data_type const output_dtype,
-                                     cudf::size_type ddof,
+                                     size_type ddof,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
@@ -132,7 +132,7 @@ struct element_type_dispatcher {
   template <typename ElementType, std::enable_if_t<is_supported_v<ElementType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      cudf::data_type const output_dtype,
-                                     cudf::size_type ddof,
+                                     size_type ddof,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
@@ -143,7 +143,7 @@ struct element_type_dispatcher {
   template <typename ElementType, std::enable_if_t<not is_supported_v<ElementType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      cudf::data_type const output_dtype,
-                                     cudf::size_type ddof,
+                                     size_type ddof,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 42288a67242..b1d951a783b 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -74,7 +74,7 @@ template <typename Op,
           typename InputIterator,
           typename OutputType = typename thrust::iterator_value<InputIterator>::type>
 rmm::device_scalar<OutputType> reduce_device(InputIterator d_in,
-                                             cudf::size_type num_items,
+                                             size_type num_items,
                                              Op binary_op,
                                              rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/reductions/segmented_all.cu b/cpp/src/reductions/segmented/all.cu
similarity index 94%
rename from cpp/src/reductions/segmented_all.cu
rename to cpp/src/reductions/segmented/all.cu
index 4536785fe82..f75fcd8066c 100644
--- a/cpp/src/reductions/segmented_all.cu
+++ b/cpp/src/reductions/segmented/all.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented_any.cu b/cpp/src/reductions/segmented/any.cu
similarity index 94%
rename from cpp/src/reductions/segmented_any.cu
rename to cpp/src/reductions/segmented/any.cu
index cc50eb1e1f4..6a4fc70d438 100644
--- a/cpp/src/reductions/segmented_any.cu
+++ b/cpp/src/reductions/segmented/any.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented/compound.cuh b/cpp/src/reductions/segmented/compound.cuh
new file mode 100644
index 00000000000..dc8a995d1b0
--- /dev/null
+++ b/cpp/src/reductions/segmented/compound.cuh
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "update_validity.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/null_mask.cuh>
+#include <cudf/detail/segmented_reduction.cuh>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <thrust/adjacent_difference.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace cudf {
+namespace reduction {
+namespace compound {
+namespace detail {
+/**
+ * @brief Multi-step reduction for operations such as mean, variance, and standard deviation.
+ *
+ * @tparam InputType  the input column data-type
+ * @tparam ResultType the output data-type
+ * @tparam Op         the compound operator derived from `cudf::reduction::op::compound_op`
+ *
+ * @param col Input column view
+ * @param offsets Indices identifying segments
+ * @param null_handling Indicates if null elements should be included in the reduction
+ * @param ddof Delta degrees of freedom used for standard deviation and variance.
+ *             The divisor used is N - ddof, where N represents the number of elements.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned scalar's device memory
+ * @return Segmented reduce result
+ */
+template <typename InputType, typename ResultType, typename Op>
+std::unique_ptr<column> compound_segmented_reduction(column_view const& col,
+                                                     device_span<size_type const> offsets,
+                                                     null_policy null_handling,
+                                                     size_type ddof,
+                                                     rmm::cuda_stream_view stream,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  auto d_col              = cudf::column_device_view::create(col, stream);
+  auto compound_op        = Op{};
+  auto const num_segments = offsets.size() - 1;
+
+  auto result = make_fixed_width_column(
+    data_type{type_to_id<ResultType>()}, num_segments, mask_state::UNALLOCATED, stream, mr);
+  auto out_itr = result->mutable_view().template begin<ResultType>();
+
+  // Compute valid counts
+  rmm::device_uvector<size_type> valid_counts(num_segments, stream);
+  if (col.has_nulls() && (null_handling == null_policy::EXCLUDE)) {
+    auto valid_fn = [] __device__(auto p) -> size_type { return static_cast<size_type>(p.second); };
+    auto itr      = thrust::make_transform_iterator(d_col->pair_begin<InputType, true>(), valid_fn);
+    cudf::reduction::detail::segmented_reduce(itr,
+                                              offsets.begin(),
+                                              offsets.end(),
+                                              valid_counts.data(),
+                                              thrust::plus<size_type>{},
+                                              0,
+                                              stream);
+  } else {
+    thrust::adjacent_difference(
+      rmm::exec_policy(stream), offsets.begin() + 1, offsets.end(), valid_counts.begin());
+  }
+
+  // Run segmented reduction
+  if (col.has_nulls()) {
+    auto nrt = compound_op.template get_null_replacing_element_transformer<ResultType>();
+    auto itr = thrust::make_transform_iterator(d_col->pair_begin<InputType, true>(), nrt);
+    cudf::reduction::detail::segmented_reduce(
+      itr, offsets.begin(), offsets.end(), out_itr, compound_op, ddof, valid_counts.data(), stream);
+  } else {
+    auto et  = compound_op.template get_element_transformer<ResultType>();
+    auto itr = thrust::make_transform_iterator(d_col->begin<InputType>(), et);
+    cudf::reduction::detail::segmented_reduce(
+      itr, offsets.begin(), offsets.end(), out_itr, compound_op, ddof, valid_counts.data(), stream);
+  }
+
+  // Compute the output null mask
+  cudf::reduction::detail::segmented_update_validity(
+    *result, col, offsets, null_handling, std::nullopt, stream, mr);
+
+  return result;
+};
+
+template <typename ElementType, typename Op>
+struct compound_float_output_dispatcher {
+ private:
+  template <typename ResultType>
+  static constexpr bool is_supported_v()
+  {
+    return std::is_floating_point_v<ResultType>;
+  }
+
+ public:
+  template <typename ResultType, std::enable_if_t<is_supported_v<ResultType>()>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const& col,
+                                     device_span<size_type const> offsets,
+                                     null_policy null_handling,
+                                     size_type ddof,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    return compound_segmented_reduction<ElementType, ResultType, Op>(
+      col, offsets, null_handling, ddof, stream, mr);
+  }
+
+  template <typename ResultType, std::enable_if_t<not is_supported_v<ResultType>()>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const&,
+                                     device_span<size_type const>,
+                                     null_policy,
+                                     size_type,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Unsupported output data type");
+  }
+};
+
+template <typename Op>
+struct compound_segmented_dispatcher {
+ private:
+  template <typename ElementType>
+  static constexpr bool is_supported_v()
+  {
+    return std::is_arithmetic_v<ElementType>;
+  }
+
+ public:
+  template <typename ElementType, std::enable_if_t<is_supported_v<ElementType>()>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const& col,
+                                     device_span<size_type const> offsets,
+                                     cudf::data_type const output_dtype,
+                                     null_policy null_handling,
+                                     size_type ddof,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    return cudf::type_dispatcher(output_dtype,
+                                 compound_float_output_dispatcher<ElementType, Op>(),
+                                 col,
+                                 offsets,
+                                 null_handling,
+                                 ddof,
+                                 stream,
+                                 mr);
+  }
+
+  template <typename ElementType, std::enable_if_t<not is_supported_v<ElementType>()>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const&,
+                                     device_span<size_type const>,
+                                     cudf::data_type const,
+                                     null_policy,
+                                     size_type,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Compound operators are not supported for non-arithmetic types");
+  }
+};
+
+}  // namespace detail
+}  // namespace compound
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented_max.cu b/cpp/src/reductions/segmented/max.cu
similarity index 94%
rename from cpp/src/reductions/segmented_max.cu
rename to cpp/src/reductions/segmented/max.cu
index 494aff66797..d72b65301c1 100644
--- a/cpp/src/reductions/segmented_max.cu
+++ b/cpp/src/reductions/segmented/max.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented/mean.cu b/cpp/src/reductions/segmented/mean.cu
new file mode 100644
index 00000000000..b7a5bfa43d6
--- /dev/null
+++ b/cpp/src/reductions/segmented/mean.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compound.cuh"
+
+#include <cudf/detail/segmented_reduction_functions.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace reduction {
+
+std::unique_ptr<cudf::column> segmented_mean(column_view const& col,
+                                             device_span<size_type const> offsets,
+                                             cudf::data_type const output_dtype,
+                                             null_policy null_handling,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  using reducer = compound::detail::compound_segmented_dispatcher<cudf::reduction::op::mean>;
+  constexpr size_type ddof = 1;  // ddof for mean calculation
+  return cudf::type_dispatcher(
+    col.type(), reducer{}, col, offsets, output_dtype, null_handling, ddof, stream, mr);
+}
+
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented_min.cu b/cpp/src/reductions/segmented/min.cu
similarity index 94%
rename from cpp/src/reductions/segmented_min.cu
rename to cpp/src/reductions/segmented/min.cu
index dee6a989ad6..b7fbedf2690 100644
--- a/cpp/src/reductions/segmented_min.cu
+++ b/cpp/src/reductions/segmented/min.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented_product.cu b/cpp/src/reductions/segmented/product.cu
similarity index 94%
rename from cpp/src/reductions/segmented_product.cu
rename to cpp/src/reductions/segmented/product.cu
index 25b31d117b3..d5442126660 100644
--- a/cpp/src/reductions/segmented_product.cu
+++ b/cpp/src/reductions/segmented/product.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented_reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
similarity index 87%
rename from cpp/src/reductions/segmented_reductions.cpp
rename to cpp/src/reductions/segmented/reductions.cpp
index 6ccc87291cc..1de55b371b3 100644
--- a/cpp/src/reductions/segmented_reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -18,7 +18,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/reduction_functions.hpp>
+#include <cudf/detail/segmented_reduction_functions.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -65,7 +65,7 @@ struct segmented_reduce_dispatch_functor {
   }
 
   template <segmented_reduce_aggregation::Kind k>
-  std::unique_ptr<column> operator()()
+  std::unique_ptr<column> operator()(segmented_reduce_aggregation const& agg)
   {
     switch (k) {
       case segmented_reduce_aggregation::SUM:
@@ -86,6 +86,21 @@ struct segmented_reduce_dispatch_functor {
       case segmented_reduce_aggregation::ALL:
         return reduction::segmented_all(
           col, offsets, output_dtype, null_handling, init, stream, mr);
+      case segmented_reduce_aggregation::SUM_OF_SQUARES:
+        return reduction::segmented_sum_of_squares(
+          col, offsets, output_dtype, null_handling, stream, mr);
+      case segmented_reduce_aggregation::MEAN:
+        return reduction::segmented_mean(col, offsets, output_dtype, null_handling, stream, mr);
+      case aggregation::VARIANCE: {
+        auto var_agg = static_cast<var_aggregation const&>(agg);
+        return reduction::segmented_variance(
+          col, offsets, output_dtype, null_handling, var_agg._ddof, stream, mr);
+      }
+      case aggregation::STD: {
+        auto var_agg = static_cast<std_aggregation const&>(agg);
+        return reduction::segmented_standard_deviation(
+          col, offsets, output_dtype, null_handling, var_agg._ddof, stream, mr);
+      }
       default:
         CUDF_FAIL("Unsupported aggregation type.");
         // TODO: Add support for compound_ops. GH #10432
@@ -115,7 +130,8 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
   return aggregation_dispatcher(
     agg.kind,
     segmented_reduce_dispatch_functor{
-      segmented_values, offsets, output_dtype, null_handling, init, stream, mr});
+      segmented_values, offsets, output_dtype, null_handling, init, stream, mr},
+    agg);
 }
 }  // namespace detail
 
diff --git a/cpp/src/reductions/simple_segmented.cuh b/cpp/src/reductions/segmented/simple.cuh
similarity index 84%
rename from cpp/src/reductions/simple_segmented.cuh
rename to cpp/src/reductions/segmented/simple.cuh
index 2b31252c18c..fb080ebf67c 100644
--- a/cpp/src/reductions/simple_segmented.cuh
+++ b/cpp/src/reductions/segmented/simple.cuh
@@ -16,17 +16,16 @@
 
 #pragma once
 
+#include "update_validity.hpp"
+
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.hpp>
-#include <cudf/detail/null_mask.cuh>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/reduction.cuh>
+#include <cudf/detail/segmented_reduction.cuh>
 #include <cudf/detail/unary.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/element_argminmax.cuh>
 #include <cudf/detail/valid_if.cuh>
-#include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -47,7 +46,7 @@ namespace simple {
 namespace detail {
 
 /**
- * @brief Segment reduction for 'sum', 'product', 'min', 'max', 'sum of squares'
+ * @brief Segment reduction for 'sum', 'product', 'min', 'max', 'sum of squares', etc
  * which directly compute the reduction by a single step reduction call.
  *
  * @tparam InputType    the input column data-type
@@ -56,9 +55,7 @@ namespace detail {
 
  * @param col Input column of data to reduce
  * @param offsets Indices to segment boundaries
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
+ * @param null_handling How null entries are processed within each segment
  * @param init Optional initial value of the reduction
  * @param stream Used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
@@ -73,7 +70,6 @@ std::unique_ptr<column> simple_segmented_reduction(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  // reduction by iterator
   auto dcol               = cudf::column_device_view::create(col, stream);
   auto simple_op          = Op{};
   auto const num_segments = offsets.size() - 1;
@@ -110,20 +106,8 @@ std::unique_ptr<column> simple_segmented_reduction(
   }
 
   // Compute the output null mask
-  auto const bitmask                         = col.null_mask();
-  auto const first_bit_indices_begin         = offsets.begin();
-  auto const first_bit_indices_end           = offsets.end() - 1;
-  auto const last_bit_indices_begin          = first_bit_indices_begin + 1;
-  auto [output_null_mask, output_null_count] = cudf::detail::segmented_null_mask_reduction(
-    bitmask,
-    first_bit_indices_begin,
-    first_bit_indices_end,
-    last_bit_indices_begin,
-    null_handling,
-    init.has_value() ? std::optional(init.value().get().is_valid()) : std::nullopt,
-    stream,
-    mr);
-  result->set_null_mask(std::move(output_null_mask), output_null_count);
+  cudf::reduction::detail::segmented_update_validity(
+    *result, col, offsets, null_handling, init, stream, mr);
 
   return result;
 }
@@ -139,9 +123,7 @@ std::unique_ptr<column> simple_segmented_reduction(
 
  * @param col Input column of data to reduce
  * @param offsets Indices to segment boundaries
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
+ * @param null_handling How null entries are processed within each segment
  * @param stream Used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Output column in device memory
@@ -184,39 +166,10 @@ std::unique_ptr<column> string_segmented_reduction(column_view const& col,
                                                stream,
                                                mr)
                             ->release()[0]);
-  auto [segmented_null_mask, segmented_null_count] =
-    cudf::detail::segmented_null_mask_reduction(col.null_mask(),
-                                                offsets.begin(),
-                                                offsets.end() - 1,
-                                                offsets.begin() + 1,
-                                                null_handling,
-                                                std::nullopt,
-                                                stream,
-                                                mr);
-
-  // If the segmented null mask contains any null values, the segmented null mask
-  // must be combined with the result null mask.
-  if (segmented_null_count > 0) {
-    if (result->null_count() == 0) {
-      // The result has no nulls. Use the segmented null mask.
-      result->set_null_mask(std::move(segmented_null_mask), segmented_null_count);
-    } else {
-      // Compute the logical AND of the segmented output null mask and the
-      // result null mask to update the result null mask and null count.
-      auto result_mview = result->mutable_view();
-      std::vector masks{static_cast<bitmask_type const*>(result_mview.null_mask()),
-                        static_cast<bitmask_type const*>(segmented_null_mask.data())};
-      std::vector<size_type> begin_bits{0, 0};
-      auto const valid_count = cudf::detail::inplace_bitmask_and(
-        device_span<bitmask_type>(static_cast<bitmask_type*>(result_mview.null_mask()),
-                                  num_bitmask_words(result->size())),
-        masks,
-        begin_bits,
-        result->size(),
-        stream);
-      result->set_null_count(result->size() - valid_count);
-    }
-  }
+
+  // Compute the output null mask
+  cudf::reduction::detail::segmented_update_validity(
+    *result, col, offsets, null_handling, std::nullopt, stream, mr);
 
   return result;
 }
@@ -242,9 +195,7 @@ std::unique_ptr<column> string_segmented_reduction(column_view const& col,
 
  * @param col Input column of data to reduce
  * @param offsets Indices to segment boundaries
- * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
- * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
- * in the segment is valid.
+ * @param null_handling How null entries are processed within each segment
  * @param stream Used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Output column in device memory
@@ -390,7 +341,7 @@ struct same_column_type_dispatcher {
 /**
  * @brief Call reduce and return a column of the type specified.
  *
- * This is used by operations sum(), product(), and sum_of_squares().
+ * This is used by operations such as sum(), product(), sum_of_squares(), etc
  * It only supports numeric types. If the output type is not the
  * same as the input type, an extra cast operation may occur.
  *
@@ -456,10 +407,8 @@ struct column_type_dispatcher {
    * @tparam ElementType The input column type or key type
    * @param col Input column (must be numeric)
    * @param offsets Indices to segment boundaries
-   * @param output_type Requested type of the scalar result
-   * @param null_handling If `null_policy::INCLUDE`, all elements in a segment must be valid for the
-   * reduced value to be valid. If `null_policy::EXCLUDE`, the reduced value is valid if any element
-   * in the segment is valid.
+   * @param output_type Requested type of the output column
+   * @param null_handling How null entries are processed within each segment
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned scalar's device memory
    */
diff --git a/cpp/src/reductions/segmented/std.cu b/cpp/src/reductions/segmented/std.cu
new file mode 100644
index 00000000000..6af5a9cf9b6
--- /dev/null
+++ b/cpp/src/reductions/segmented/std.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compound.cuh"
+
+#include <cudf/detail/segmented_reduction_functions.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace reduction {
+
+std::unique_ptr<cudf::column> segmented_standard_deviation(column_view const& col,
+                                                           device_span<size_type const> offsets,
+                                                           cudf::data_type const output_dtype,
+                                                           null_policy null_handling,
+                                                           size_type ddof,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr)
+{
+  using reducer =
+    compound::detail::compound_segmented_dispatcher<cudf::reduction::op::standard_deviation>;
+  return cudf::type_dispatcher(
+    col.type(), reducer(), col, offsets, output_dtype, null_handling, ddof, stream, mr);
+}
+
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented_sum.cu b/cpp/src/reductions/segmented/sum.cu
similarity index 95%
rename from cpp/src/reductions/segmented_sum.cu
rename to cpp/src/reductions/segmented/sum.cu
index 4caaa727371..0cb8decdc58 100644
--- a/cpp/src/reductions/segmented_sum.cu
+++ b/cpp/src/reductions/segmented/sum.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "simple_segmented.cuh"
+#include "simple.cuh"
 
 #include <cudf/detail/reduction_functions.hpp>
 
diff --git a/cpp/src/reductions/segmented/sum_of_squares.cu b/cpp/src/reductions/segmented/sum_of_squares.cu
new file mode 100644
index 00000000000..1ee4f992b6d
--- /dev/null
+++ b/cpp/src/reductions/segmented/sum_of_squares.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "simple.cuh"
+
+#include <cudf/detail/segmented_reduction_functions.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace reduction {
+
+std::unique_ptr<cudf::column> segmented_sum_of_squares(column_view const& col,
+                                                       device_span<size_type const> offsets,
+                                                       cudf::data_type const output_dtype,
+                                                       null_policy null_handling,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::mr::device_memory_resource* mr)
+{
+  using reducer = simple::detail::column_type_dispatcher<cudf::reduction::op::sum_of_squares>;
+  return cudf::type_dispatcher(
+    col.type(), reducer{}, col, offsets, output_dtype, null_handling, std::nullopt, stream, mr);
+}
+
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented/update_validity.cu b/cpp/src/reductions/segmented/update_validity.cu
new file mode 100644
index 00000000000..8d70f961b63
--- /dev/null
+++ b/cpp/src/reductions/segmented/update_validity.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "update_validity.hpp"
+
+#include <cudf/detail/null_mask.cuh>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/span.hpp>
+
+namespace cudf {
+namespace reduction {
+namespace detail {
+
+void segmented_update_validity(column& result,
+                               column_view const& col,
+                               device_span<size_type const> offsets,
+                               null_policy null_handling,
+                               std::optional<std::reference_wrapper<scalar const>> init,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
+{
+  auto [output_null_mask, output_null_count] = cudf::detail::segmented_null_mask_reduction(
+    col.null_mask(),
+    offsets.begin(),
+    offsets.end() - 1,
+    offsets.begin() + 1,
+    null_handling,
+    init.has_value() ? std::optional(init.value().get().is_valid()) : std::nullopt,
+    stream,
+    mr);
+  result.set_null_mask(std::move(output_null_mask), output_null_count);
+}
+
+}  // namespace detail
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented/update_validity.hpp b/cpp/src/reductions/segmented/update_validity.hpp
new file mode 100644
index 00000000000..0003b98308a
--- /dev/null
+++ b/cpp/src/reductions/segmented/update_validity.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <optional>
+
+namespace cudf {
+namespace reduction {
+namespace detail {
+
+/**
+ * @brief Compute the validity mask and set it on the result column
+ *
+ * If `null_handling == null_policy::INCLUDE`, all elements in a segment must be valid for the
+ * reduced value to be valid.
+ * If `null_handling == null_policy::EXCLUDE`, the reduced value is valid if any element
+ * in the segment is valid.
+ *
+ * @param result Result of segmented reduce to update the null mask
+ * @param col Input column before reduce
+ * @param offsets Indices to segment boundaries
+ * @param null_handling How null entries are processed within each segment
+ * @param init Optional initial value
+ * @param stream Used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+void segmented_update_validity(column& result,
+                               column_view const& col,
+                               device_span<size_type const> offsets,
+                               null_policy null_handling,
+                               std::optional<std::reference_wrapper<scalar const>> init,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/segmented/var.cu b/cpp/src/reductions/segmented/var.cu
new file mode 100644
index 00000000000..84adf353700
--- /dev/null
+++ b/cpp/src/reductions/segmented/var.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compound.cuh"
+
+#include <cudf/detail/segmented_reduction_functions.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace reduction {
+
+std::unique_ptr<cudf::column> segmented_variance(column_view const& col,
+                                                 device_span<size_type const> offsets,
+                                                 cudf::data_type const output_dtype,
+                                                 null_policy null_handling,
+                                                 size_type ddof,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  using reducer = compound::detail::compound_segmented_dispatcher<cudf::reduction::op::variance>;
+  return cudf::type_dispatcher(
+    col.type(), reducer(), col, offsets, output_dtype, null_handling, ddof, stream, mr);
+}
+
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu
index bb29e5cd030..e9ba75f68e6 100644
--- a/cpp/src/reductions/std.cu
+++ b/cpp/src/reductions/std.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace reduction {
 
 std::unique_ptr<cudf::scalar> standard_deviation(column_view const& col,
                                                  cudf::data_type const output_dtype,
-                                                 cudf::size_type ddof,
+                                                 size_type ddof,
                                                  rmm::cuda_stream_view stream,
                                                  rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu
index 2df653858b0..4d86918d6c6 100644
--- a/cpp/src/reductions/var.cu
+++ b/cpp/src/reductions/var.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace reduction {
 
 std::unique_ptr<cudf::scalar> variance(column_view const& col,
                                        cudf::data_type const output_dtype,
-                                       cudf::size_type ddof,
+                                       size_type ddof,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp
index 74e69d2fb05..b4873a14509 100644
--- a/cpp/tests/reductions/segmented_reduction_tests.cpp
+++ b/cpp/tests/reductions/segmented_reduction_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,10 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
 
+#include <limits>
+#include <utility>
+#include <vector>
+
 #define XXX 0  // null placeholder
 
 template <typename T>
@@ -770,6 +774,240 @@ TEST_F(SegmentedReductionTestUntyped, NonNullableInput)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, null_init_expect);
 }
 
+TEST_F(SegmentedReductionTestUntyped, Mean)
+{
+  auto const input =
+    cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
+
+  auto const expected =
+    cudf::test::fixed_width_column_wrapper<float>{{10, 0, 30, 70}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, MeanNulls)
+{
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1});
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
+
+  auto expected = cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 70}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 0}, {1, 0, 1, 0}};
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, SumOfSquares)
+{
+  auto const input =
+    cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::INT32};
+
+  auto const expected =
+    cudf::test::fixed_width_column_wrapper<int32_t>{{100, 0, 2900, 25500}, {1, 0, 1, 1}};
+
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, SumOfSquaresNulls)
+{
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1});
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::INT64};
+
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 20600}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 0}, {1, 0, 1, 0}};
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, StandardDeviation)
+{
+  constexpr float NaN{std::numeric_limits<float>::quiet_NaN()};
+  auto const input =
+    cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_std_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
+
+  auto expected = cudf::test::fixed_width_column_wrapper<float>{
+    {NaN, 0.f, 10.f, static_cast<float>(std::sqrt(250.))}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls)
+{
+  constexpr double NaN{std::numeric_limits<double>::quiet_NaN()};
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_std_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
+
+  auto expected = cudf::test::fixed_width_column_wrapper<double>{
+    {NaN, 0., std::sqrt(50.), std::sqrt(135.)}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {1, 0, 0, 0}};
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, Variance)
+{
+  constexpr float NaN{std::numeric_limits<float>::quiet_NaN()};
+  auto const input =
+    cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
+
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<float>{{NaN, 0.f, 100.f, 250.f}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, VarianceNulls)
+{
+  constexpr double NaN{std::numeric_limits<double>::quiet_NaN()};
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const agg         = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
+  auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
+
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 50., 135.}, {1, 0, 1, 1}};
+  auto result =
+    cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {1, 0, 0, 0}};
+  result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+}
+
+TEST_F(SegmentedReductionTestUntyped, Errors)
+{
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+  auto const offsets = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
+  auto const d_offsets =
+    cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream());
+  auto const null_policy = cudf::null_policy::EXCLUDE;
+  auto const output_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS};
+  auto const str_input =
+    cudf::test::strings_column_wrapper({"10", "0", "20", "30", "54", "63", "", "72", "81"});
+
+  auto const sum_agg = cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *sum_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *sum_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const product_agg = cudf::make_product_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *product_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *product_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const min_agg = cudf::make_min_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *min_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const max_agg = cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *max_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const any_agg = cudf::make_any_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *any_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *any_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const all_agg = cudf::make_all_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *all_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *all_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const mean_agg = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *mean_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *mean_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const std_agg = cudf::make_std_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *std_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *std_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const var_agg = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *var_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *var_agg, output_type, null_policy),
+               cudf::logic_error);
+
+  auto const squares_agg =
+    cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
+  EXPECT_THROW(cudf::segmented_reduce(input, d_offsets, *squares_agg, output_type, null_policy),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::segmented_reduce(str_input, d_offsets, *squares_agg, output_type, null_policy),
+               cudf::logic_error);
+}
+
 TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn)
 {
   auto const input   = cudf::test::fixed_width_column_wrapper<int32_t>{};