From ba763105e006494a536c1a2fafc5112ab3dae362 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Fri, 24 Sep 2021 09:37:11 -0500
Subject: [PATCH] Support for using tdigests to compute approximate
percentiles. (#8983)
Addresses https://github.com/rapidsai/cudf/issues/7170
Adds 3 pieces of new functionality:
- A `TDIGEST` aggregation which creates a tdigest column (https://arxiv.org/pdf/1902.04023.pdf) from a stream of input scalars.
- A `MERGE_TDIGEST` aggregation which merges multiple tdigest columns into a new one.
- a `percentile_approx` function which performs percentile queries on tdigest data.
Also exposes several ::detail functions (`sort`, `merge`, `slice`) in detail headers.
Ready for review. I do need to add more tests though.
Authors:
- https://github.com/nvdbaranec
Approvers:
- AJ Schmidt (https://github.com/ajschmidt8)
- Jake Hemstad (https://github.com/jrhemstad)
- MithunR (https://github.com/mythrocks)
- Robert Maynard (https://github.com/robertmaynard)
URL: https://github.com/rapidsai/cudf/pull/8983
---
conda/recipes/libcudf/meta.yaml | 1 +
cpp/CMakeLists.txt | 4 +-
cpp/include/cudf/aggregation.hpp | 79 +-
.../cudf/detail/aggregation/aggregation.hpp | 76 ++
cpp/include/cudf/detail/copy.hpp | 9 +
cpp/include/cudf/detail/merge.cuh | 17 +
cpp/include/cudf/detail/quantiles.hpp | 18 +-
cpp/include/cudf/detail/sorting.hpp | 16 +-
cpp/include/cudf/detail/tdigest/tdigest.hpp | 79 ++
cpp/include/cudf/quantiles.hpp | 28 +
cpp/include/cudf/sorting.hpp | 6 +-
cpp/include/cudf_test/column_utilities.hpp | 7 +-
cpp/src/aggregation/aggregation.cpp | 41 +
cpp/src/copying/slice.cu | 34 +-
cpp/src/groupby/sort/aggregate.cpp | 91 ++
cpp/src/groupby/sort/group_reductions.hpp | 88 ++
cpp/src/groupby/sort/group_tdigest.cu | 841 ++++++++++++++++++
cpp/src/quantiles/tdigest/tdigest.cu | 383 ++++++++
cpp/src/sort/sort.cu | 8 +-
cpp/src/sort/stable_sort.cu | 4 +-
cpp/tests/CMakeLists.txt | 2 +
cpp/tests/groupby/groupby_test_util.hpp | 55 ++
cpp/tests/groupby/tdigest_tests.cu | 584 ++++++++++++
cpp/tests/quantiles/percentile_approx_test.cu | 435 +++++++++
cpp/tests/utilities/column_utilities.cu | 61 +-
25 files changed, 2919 insertions(+), 48 deletions(-)
create mode 100644 cpp/include/cudf/detail/tdigest/tdigest.hpp
create mode 100644 cpp/src/groupby/sort/group_tdigest.cu
create mode 100644 cpp/src/quantiles/tdigest/tdigest.cu
create mode 100644 cpp/tests/groupby/tdigest_tests.cu
create mode 100644 cpp/tests/quantiles/percentile_approx_test.cu
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index c3450fe8d88..fd687de6698 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -93,6 +93,7 @@ test:
- test -f $PREFIX/include/cudf/detail/sequence.hpp
- test -f $PREFIX/include/cudf/detail/sorting.hpp
- test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
+ - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp
- test -f $PREFIX/include/cudf/detail/transform.hpp
- test -f $PREFIX/include/cudf/detail/transpose.hpp
- test -f $PREFIX/include/cudf/detail/unary.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2df35aa0971..00af1973cfe 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -236,8 +236,9 @@ add_library(cudf
src/groupby/sort/group_max_scan.cu
src/groupby/sort/group_min_scan.cu
src/groupby/sort/group_rank_scan.cu
- src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_replace_nulls.cu
+ src/groupby/sort/group_sum_scan.cu
+ src/groupby/sort/group_tdigest.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/hash/md5_hash.cu
@@ -318,6 +319,7 @@ add_library(cudf
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
+ src/quantiles/tdigest/tdigest.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
src/reductions/all.cu
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index c302895880d..fb6401a3cc1 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -87,7 +87,9 @@ class aggregation {
CUDA, ///< CUDA UDF based reduction
MERGE_LISTS, ///< merge multiple lists values into one list
MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries
- MERGE_M2 ///< merge partial values of M2 aggregation
+ MERGE_M2, ///< merge partial values of M2 aggregation
+ TDIGEST, ///< create a tdigest from a set of input values
+ MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together
};
aggregation() = delete;
@@ -493,5 +495,80 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu
template
std::unique_ptr make_merge_m2_aggregation();
+/**
+ * @brief Factory to create a TDIGEST aggregation
+ *
+ * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values.
+ * The input aggregation values are expected to be fixed-width numeric types.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A TDIGEST aggregation object.
+ */
+template
+std::unique_ptr make_tdigest_aggregation(int max_centroids = 1000);
+
+/**
+ * @brief Factory to create a MERGE_TDIGEST aggregation
+ *
+ * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation`
+ * or `make_merge_tdigest_aggregation` to produce a new tdigest
+ * (https://arxiv.org/pdf/1902.04023.pdf) column.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A MERGE_TDIGEST aggregation object.
+ */
+template
+std::unique_ptr make_merge_tdigest_aggregation(int max_centroids = 1000);
+
/** @} */ // end of group
} // namespace cudf
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 5a1fc3b9398..05d1bf3e595 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple
class merge_sets_aggregation const& agg);
virtual std::vector> visit(data_type col_type,
class merge_m2_aggregation const& agg);
+ virtual std::vector> visit(data_type col_type,
+ class tdigest_aggregation const& agg);
+ virtual std::vector> visit(
+ data_type col_type, class merge_tdigest_aggregation const& agg);
};
class aggregation_finalizer { // Declares the interface for the finalizer
@@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer
virtual void visit(class merge_lists_aggregation const& agg);
virtual void visit(class merge_sets_aggregation const& agg);
virtual void visit(class merge_m2_aggregation const& agg);
+ virtual void visit(class tdigest_aggregation const& agg);
+ virtual void visit(class merge_tdigest_aggregation const& agg);
};
/**
@@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation {
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
};
+/**
+ * @brief Derived aggregation class for specifying TDIGEST aggregation
+ */
+class tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit tdigest_aggregation(int max_centroids_)
+ : aggregation{TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr clone() const override
+ {
+ return std::make_unique(*this);
+ }
+ std::vector> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
+/**
+ * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation
+ */
+class merge_tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit merge_tdigest_aggregation(int max_centroids_)
+ : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr clone() const override
+ {
+ return std::make_unique(*this);
+ }
+ std::vector> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
/**
* @brief Sentinel value used for `ARGMAX` aggregation.
*
@@ -1120,6 +1174,24 @@ struct target_type_impl {
using type = struct_view;
};
+// Always use numeric types for TDIGEST
+template
+struct target_type_impl() || is_fixed_point())>> {
+ using type = struct_view;
+};
+
+// MERGE_TDIGEST. The root column type for a tdigest column is a list_view. Strictly
+// speaking, this check is not sufficient to guarantee we are actually being given a
+// real tdigest column, but we will do further verification inside the aggregation code.
+template
+struct target_type_impl>> {
+ using type = struct_view;
+};
+
/**
* @brief Helper alias to get the accumulator type for performing aggregation
* `k` on elements of type `Source`
@@ -1224,6 +1296,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin
return f.template operator()(std::forward(args)...);
case aggregation::MERGE_M2:
return f.template operator()(std::forward(args)...);
+ case aggregation::TDIGEST:
+ return f.template operator()(std::forward(args)...);
+ case aggregation::MERGE_TDIGEST:
+ return f.template operator()(std::forward(args)...);
default: {
#ifndef __CUDA_ARCH__
CUDF_FAIL("Unsupported aggregation.");
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index fb5cfad6186..9f06661c8d1 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -75,6 +75,15 @@ std::vector slice(column_view const& input,
std::vector const& indices,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+/**
+ * @copydoc cudf::slice(table_view const&,std::vector const&)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::vector slice(table_view const& input,
+ std::vector const& indices,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
/**
* @copydoc cudf::shift(column_view const&,size_type,scalar const&,
* rmm::mr::device_memory_resource*)
diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh
index a779c3defbb..ec83e348e33 100644
--- a/cpp/include/cudf/detail/merge.cuh
+++ b/cpp/include/cudf/detail/merge.cuh
@@ -145,5 +145,22 @@ struct row_lexicographic_tagged_comparator {
order const* _column_order{};
};
+/**
+ * @copydoc std::unique_ptr merge(
+ * std::vector const& tables_to_merge,
+ * std::vector const& key_cols,
+ * std::vector const& column_order,
+ * std::vector const& null_precedence,
+ * rmm::mr::device_memory_resource* mr)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr merge(std::vector const& tables_to_merge,
+ std::vector const& key_cols,
+ std::vector const& column_order,
+ std::vector const& null_precedence,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
+
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp
index 5fb2ce4cbe6..7a76f9cab88 100644
--- a/cpp/include/cudf/detail/quantiles.hpp
+++ b/cpp/include/cudf/detail/quantiles.hpp
@@ -22,7 +22,8 @@
namespace cudf {
namespace detail {
-/** @copydoc cudf::quantile()
+/**
+ * @copydoc cudf::quantile()
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
@@ -35,7 +36,8 @@ std::unique_ptr quantile(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-/** @copydoc cudf::quantiles()
+/**
+ * @copydoc cudf::quantiles()
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
@@ -49,5 +51,17 @@ std::unique_ptr quantiles(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+/**
+ * @copydoc cudf::percentile_approx(column_view const&, column_view const&,
+ * rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr percentile_approx(
+ column_view const& input,
+ column_view const& percentiles,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
index 3127a5f89f1..b5dfb34c043 100644
--- a/cpp/include/cudf/detail/sorting.hpp
+++ b/cpp/include/cudf/detail/sorting.hpp
@@ -32,7 +32,7 @@ namespace detail {
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr sorted_order(
- table_view input,
+ table_view const& input,
std::vector const& column_order = {},
std::vector const& null_precedence = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
@@ -44,7 +44,7 @@ std::unique_ptr sorted_order(
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr stable_sorted_order(
- table_view input,
+ table_view const& input,
std::vector const& column_order = {},
std::vector const& null_precedence = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
@@ -90,5 +90,17 @@ std::unique_ptr segmented_sort_by_key(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+/**
+ * @copydoc cudf::sort
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr sort(
+ table_view const& values,
+ std::vector const& column_order = {},
+ std::vector const& null_precedence = {},
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
new file mode 100644
index 00000000000..94c22911c1e
--- /dev/null
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+namespace tdigest {
+
+// mean and weight column indices within tdigest inner struct columns
+constexpr size_type mean_column_index = 0;
+constexpr size_type weight_column_index = 1;
+
+// min and max column indices within tdigest outer struct columns
+constexpr size_type centroid_column_index = 0;
+constexpr size_type min_column_index = 1;
+constexpr size_type max_column_index = 2;
+
+/**
+ * @brief Verifies that the input column is a valid tdigest column.
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param col Column to be checked
+ *
+ * @throws cudf::logic_error if the column is not a valid tdigest column.
+ */
+void check_is_valid_tdigest_column(column_view const& col);
+
+/**
+ * @brief Create an empty tdigest column.
+ *
+ * An empty tdigest column contains a single row of length 0
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ *
+ * @returns An empty tdigest column.
+ */
+std::unique_ptr make_empty_tdigest_column(
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+} // namespace tdigest
+} // namespace detail
+} // namespace cudf
\ No newline at end of file
diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp
index 94b5c344f4f..d21f6dff79c 100644
--- a/cpp/include/cudf/quantiles.hpp
+++ b/cpp/include/cudf/quantiles.hpp
@@ -17,6 +17,7 @@
#pragma once
#include
+#include
#include
#include
@@ -94,5 +95,32 @@ std::unique_ptr quantiles(
std::vector const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+/**
+ * @brief Calculate approximate percentiles on an input tdigest column.
+ *
+ * tdigest (https://arxiv.org/pdf/1902.04023.pdf) columns are produced specifically
+ * by the TDIGEST and MERGE_TDIGEST aggregations. These columns represent
+ * compressed representations of a very large input data set that can be
+ * queried for quantile information.
+ *
+ * Produces a LIST column where each row `i` represents output from querying the
+ * corresponding tdigest from `input` row `i`. The length of each output list
+ * is the number of percentiles specified in `percentiles`.
+ *
+ * @param input tdigest input data. One tdigest per row.
+ * @param percentiles Desired percentiles in range [0, 1].
+ * @param mr Device memory resource used to allocate the returned column's device
+ * memory
+ *
+ * @throws cudf::logic_error if `input` is not a valid tdigest column.
+ * @throws cudf::logic_error if `percentiles` is not a FLOAT64 column.
+ *
+ * @returns LIST Column containing requested percentile values as FLOAT64.
+ */
+std::unique_ptr percentile_approx(
+ structs_column_view const& input,
+ column_view const& percentiles,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
/** @} */ // end of group
} // namespace cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 36a8131a78e..69eb8b3490a 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -58,7 +58,7 @@ enum class rank_method {
* `input` if it were sorted
*/
std::unique_ptr sorted_order(
- table_view input,
+ table_view const& input,
std::vector const& column_order = {},
std::vector const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
@@ -72,7 +72,7 @@ std::unique_ptr sorted_order(
* @copydoc cudf::sorted_order
*/
std::unique_ptr stable_sorted_order(
- table_view input,
+ table_view const& input,
std::vector const& column_order = {},
std::vector const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
@@ -112,7 +112,7 @@ bool is_sorted(cudf::table_view const& table,
* @return New table containing the desired sorted order of `input`
*/
std::unique_ptr sort(
- table_view input,
+ table_view const& input,
std::vector const& column_order = {},
std::vector const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index 553d8a97bd2..aa77686fee4 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -38,6 +38,8 @@ enum class debug_output_level {
QUIET // no debug output
};
+constexpr size_type default_ulp = 4;
+
/**
* @brief Verifies the property equality of two columns.
*
@@ -93,12 +95,15 @@ bool expect_columns_equal(cudf::column_view const& lhs,
* @param lhs The first column
* @param rhs The second column
* @param verbosity Level of debug output verbosity
+ * @param fp_ulps # of ulps of tolerance to allow when comparing
+ * floating point values
*
* @returns True if the columns (and their properties) are equivalent, false otherwise
*/
bool expect_columns_equivalent(cudf::column_view const& lhs,
cudf::column_view const& rhs,
- debug_output_level verbosity = debug_output_level::FIRST_ERROR);
+ debug_output_level verbosity = debug_output_level::FIRST_ERROR,
+ size_type fp_ulps = cudf::test::default_ulp);
/**
* @brief Verifies the bitwise equality of two device memory buffers.
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index c3d992e1181..b550b61785b 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -202,6 +202,18 @@ std::vector> simple_aggregations_collector::visit(
return visit(col_type, static_cast(agg));
}
+std::vector> simple_aggregations_collector::visit(
+ data_type col_type, tdigest_aggregation const& agg)
+{
+ return visit(col_type, static_cast(agg));
+}
+
+std::vector> simple_aggregations_collector::visit(
+ data_type col_type, merge_tdigest_aggregation const& agg)
+{
+ return visit(col_type, static_cast(agg));
+}
+
// aggregation_finalizer ----------------------------------------
void aggregation_finalizer::visit(aggregation const& agg) {}
@@ -346,6 +358,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg)
visit(static_cast(agg));
}
+void aggregation_finalizer::visit(tdigest_aggregation const& agg)
+{
+ visit(static_cast(agg));
+}
+
+void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg)
+{
+ visit(static_cast(agg));
+}
+
} // namespace detail
std::vector> aggregation::get_simple_aggregations(
@@ -668,6 +690,25 @@ std::unique_ptr make_merge_m2_aggregation()
template std::unique_ptr make_merge_m2_aggregation();
template std::unique_ptr make_merge_m2_aggregation();
+template
+std::unique_ptr make_tdigest_aggregation(int max_centroids)
+{
+ return std::make_unique(max_centroids);
+}
+template std::unique_ptr make_tdigest_aggregation(int max_centroids);
+template std::unique_ptr make_tdigest_aggregation(
+ int max_centroids);
+
+template
+std::unique_ptr make_merge_tdigest_aggregation(int max_centroids)
+{
+ return std::make_unique(max_centroids);
+}
+template std::unique_ptr make_merge_tdigest_aggregation(
+ int max_centroids);
+template std::unique_ptr make_merge_tdigest_aggregation(
+ int max_centroids);
+
namespace detail {
namespace {
struct target_type_functor {
diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu
index 0e41689dc4b..d1c12056393 100644
--- a/cpp/src/copying/slice.cu
+++ b/cpp/src/copying/slice.cu
@@ -63,17 +63,9 @@ std::vector slice(column_view const& input,
return std::vector{begin, begin + indices.size() / 2};
}
-} // namespace detail
-
-std::vector slice(cudf::column_view const& input,
- std::vector const& indices)
-{
- CUDF_FUNC_RANGE();
- return detail::slice(input, indices, rmm::cuda_stream_default);
-}
-
-std::vector slice(cudf::table_view const& input,
- std::vector const& indices)
+std::vector slice(table_view const& input,
+ std::vector const& indices,
+ rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even");
@@ -81,7 +73,7 @@ std::vector slice(cudf::table_view const& input,
// 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j]
// where i is the i'th column of the j'th table_view
- auto op = [&indices](auto const& c) { return cudf::slice(c, indices); };
+ auto op = [&indices, stream](auto const& c) { return cudf::detail::slice(c, indices, stream); };
auto f = thrust::make_transform_iterator(input.begin(), op);
auto sliced_table = std::vector>(f, f + input.num_columns());
@@ -99,6 +91,22 @@ std::vector slice(cudf::table_view const& input,
}
return result;
-};
+}
+
+} // namespace detail
+
+std::vector slice(cudf::column_view const& input,
+ std::vector const& indices)
+{
+ CUDF_FUNC_RANGE();
+ return detail::slice(input, indices, rmm::cuda_stream_default);
+}
+
+std::vector slice(cudf::table_view const& input,
+ std::vector const& indices)
+{
+ CUDF_FUNC_RANGE();
+ return detail::slice(input, indices, rmm::cuda_stream_default);
+}
} // namespace cudf
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 726b51b7702..9f3d67ac38b 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -525,6 +525,97 @@ void aggregate_result_functor::operator()(aggregation con
get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
};
+/**
+ * @brief Generate a tdigest column from a grouped set of numeric input values.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ */
+template <>
+void aggregate_result_functor::operator()(aggregation const& agg)
+{
+ if (cache.has_result(col_idx, agg)) { return; }
+
+ auto const max_centroids =
+ dynamic_cast(agg).max_centroids;
+
+ auto count_agg = make_count_aggregation();
+ operator()(*count_agg);
+ column_view valid_counts = cache.get_result(col_idx, *count_agg);
+
+ cache.add_result(col_idx,
+ agg,
+ detail::group_tdigest(
+ get_sorted_values(),
+ helper.group_offsets(stream),
+ helper.group_labels(stream),
+ {valid_counts.begin(), static_cast(valid_counts.size())},
+ helper.num_groups(stream),
+ max_centroids,
+ stream,
+ mr));
+};
+
+/**
+ * @brief Generate a merged tdigest column from a grouped set of input tdigest columns.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ */
+template <>
+void aggregate_result_functor::operator()(aggregation const& agg)
+{
+ if (cache.has_result(col_idx, agg)) { return; }
+
+ auto const max_centroids =
+ dynamic_cast(agg).max_centroids;
+ cache.add_result(col_idx,
+ agg,
+ detail::group_merge_tdigest(get_grouped_values(),
+ helper.group_offsets(stream),
+ helper.group_labels(stream),
+ helper.num_groups(stream),
+ max_centroids,
+ stream,
+ mr));
+};
+
} // namespace detail
// Sort-based groupby
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 2770162da2d..cb01ee8e053 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -442,6 +442,94 @@ std::unique_ptr group_merge_m2(column_view const& values,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
+/**
+ * @brief Generate a tdigest column from a grouped set of numeric input values.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param values Grouped (and sorted) values to merge.
+ * @param group_offsets Offsets of groups' starting points within @p values.
+ * @param group_labels 0-based ID of group that the corresponding value belongs to
+ * @param group_valid_counts Per-group counts of valid elements.
+ * @param num_groups Number of groups.
+ * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
+ * values result in a larger, more precise tdigest.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns tdigest column, with 1 tdigest per row
+ */
+std::unique_ptr group_tdigest(column_view const& values,
+ cudf::device_span group_offsets,
+ cudf::device_span group_labels,
+ cudf::device_span group_valid_counts,
+ size_type num_groups,
+ int max_centroids,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Merges tdigests within the same group to generate a new tdigest.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param values Grouped tdigests to merge.
+ * @param group_offsets Offsets of groups' starting points within @p values.
+ * @param group_labels 0-based ID of group that the corresponding value belongs to
+ * @param num_groups Number of groups.
+ * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
+ * values result in a larger, more precise tdigest.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns tdigest column, with 1 tdigest per row
+ */
+std::unique_ptr group_merge_tdigest(column_view const& values,
+ cudf::device_span group_offsets,
+ cudf::device_span group_labels,
+ size_type num_groups,
+ int max_centroids,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
+
/** @endinternal
*
*/
diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu
new file mode 100644
index 00000000000..5b4252a9063
--- /dev/null
+++ b/cpp/src/groupby/sort/group_tdigest.cu
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+
+namespace {
+
// the most representative point within a cluster of similar
// values. {mean, weight, validity}
// NOTE: Using a tuple here instead of a struct to take advantage of
// thrust zip iterators for output.
using centroid = thrust::tuple<double, double, bool>;

// make a centroid from a scalar with a weight of 1.
template <typename T>
struct make_centroid {
  column_device_view const col;  // grouped input values

  // produce the centroid {value, weight == 1, validity} for the input row at `index`.
  // .element<T>() handles fixed-point -> double conversion for us.
  centroid operator() __device__(size_type index)
  {
    return {static_cast<double>(col.element<T>(index)), 1, col.is_valid(index)};
  }
};
+
// make a centroid from an input stream of mean/weight values. centroids built
// this way are always considered valid.
struct make_weighted_centroid {
  double const* mean;    // stream of means
  double const* weight;  // stream of weights

  // build the valid centroid {mean[i], weight[i], true} for row i
  centroid operator() __device__(size_type i)
  {
    return centroid{mean[i], weight[i], true};
  }
};
+
// merge two centroids into one, producing the weighted average of their means.
// an invalid centroid contributes nothing; merging two invalid centroids yields
// an invalid, zeroed centroid.
struct merge_centroids {
  centroid operator() __device__(centroid const& lhs, centroid const& rhs)
  {
    bool const lhs_valid = thrust::get<2>(lhs);
    bool const rhs_valid = thrust::get<2>(rhs);
    if (!lhs_valid) { return rhs_valid ? rhs : centroid{0, 0, false}; }
    if (!rhs_valid) { return lhs; }

    // both valid: weight-average the means, sum the weights.
    double const combined_weight = thrust::get<1>(lhs) + thrust::get<1>(rhs);
    double const combined_mean =
      (thrust::get<0>(lhs) * thrust::get<1>(lhs) + thrust::get<0>(rhs) * thrust::get<1>(rhs)) /
      combined_weight;
    return {combined_mean, combined_weight, true};
  }
};
+
/**
 * @brief A functor which returns the nearest cumulative weight in the input stream prior to the
 * specified next weight limit.
 *
 * This functor assumes the weight for all scalars is simply 1. Under this assumption,
 * the nearest weight that will be <= the next limit is simply the nearest integer < the limit,
 * which we can get by just taking floor(next_limit). For example if our next limit is 3.56, the
 * nearest whole number <= it is floor(3.56) == 3.
 */
struct nearest_value_scalar_weights {
  // returns {nearest cumulative weight <= next_limit, index of that input value}
  thrust::pair<double, int> operator() __device__(double next_limit, size_type)
  {
    double const f = floor(next_limit);
    // with unit weights, the value index is the cumulative weight - 1, clamped at 0
    return {f, max(0, static_cast<int>(next_limit) - 1)};
  }
};
+
/**
 * @brief A functor which returns the nearest cumulative weight in the input stream prior to the
 * specified next weight limit.
 *
 * This functor assumes we are dealing with grouped, sorted, weighted centroids.
 */
struct nearest_value_centroid_weights {
  double const* cumulative_weights;  // cumulative weights of the grouped centroids
  offset_type const* outer_offsets;  // groups
  offset_type const* inner_offsets;  // tdigests within a group

  // returns {nearest cumulative weight <= next_limit, index of that centroid within the group}
  thrust::pair<double, int> operator() __device__(double next_limit, size_type group_index)
  {
    auto const tdigest_begin = outer_offsets[group_index];
    auto const tdigest_end   = outer_offsets[group_index + 1];
    auto const num_weights   = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
    // cumulative weights for this group only
    double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin];

    auto const index = ((thrust::lower_bound(thrust::seq,
                                             group_cumulative_weights,
                                             group_cumulative_weights + num_weights,
                                             next_limit)) -
                        group_cumulative_weights);

    return index == 0 ? thrust::pair<double, int>{0, 0}
                      : thrust::pair<double, int>{group_cumulative_weights[index - 1],
                                                  static_cast<int>(index) - 1};
  }
};
+
/**
 * @brief A functor which returns the cumulative input weight for a given index in a
 * set of grouped input values.
 *
 * This functor assumes the weight for all scalars is simply 1. Under this assumption,
 * the cumulative weight for a given value index I is simply I+1.
 */
struct cumulative_scalar_weight {
  cudf::device_span<size_type const> group_offsets;
  cudf::device_span<size_type const> group_labels;

  // returns {group index, relative value index within the group, cumulative weight}
  std::tuple<size_type, size_type, double> operator() __device__(size_type value_index) const
  {
    auto const group_index          = group_labels[value_index];
    auto const relative_value_index = value_index - group_offsets[group_index];
    return {group_index, relative_value_index, relative_value_index + 1};
  }
};
+
/**
 * @brief A functor which returns the cumulative input weight for a given index in a
 * set of grouped input centroids.
 *
 * This functor assumes we are dealing with grouped, weighted centroids.
 */
struct cumulative_centroid_weight {
  double const* cumulative_weights;  // cumulative weights of the grouped centroids
  cudf::device_span<size_type const> group_labels;
  offset_type const* outer_offsets;                    // groups
  cudf::device_span<offset_type const> inner_offsets;  // tdigests within a group

  // returns {group index, relative value index within the group, cumulative weight}
  std::tuple<size_type, size_type, double> operator() __device__(size_type value_index) const
  {
    // which original tdigest this absolute value index falls inside
    auto const tdigest_index =
      static_cast<size_type>(
        thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) -
        inner_offsets.begin()) -
      1;
    auto const group_index          = group_labels[tdigest_index];
    auto const first_tdigest_index  = outer_offsets[group_index];
    auto const first_weight_index   = inner_offsets[first_tdigest_index];
    auto const relative_value_index = value_index - first_weight_index;
    // cumulative weights for this group only
    double const* group_cumulative_weights = cumulative_weights + first_weight_index;

    return {group_index, relative_value_index, group_cumulative_weights[relative_value_index]};
  }
};
+
// a monotonically increasing scale function which produces a distribution
// of centroids that is more densely packed in the middle of the input
// than at the ends.
__device__ double scale_func_k1(double quantile, double delta_norm)
{
  // map the quantile through asin, step by one unit in k-space, and map back
  double const k = delta_norm * asin(2.0 * quantile - 1.0) + 1.0;
  return (sin(k / delta_norm) + 1.0) / 2.0;
}
+
/**
 * @brief Compute a set of cluster limits (brackets, essentially) for a
 * given tdigest based on the specified delta and the total weight of values
 * to be added.
 *
 * The number of clusters generated will always be <= delta_, where delta_ is
 * a reasonably small number likely << 10000.
 *
 * Each input group gets an independent set of clusters generated. 1 thread
 * per group.
 *
 * This kernel is called in a two-pass style. Once to compute the per-group
 * cluster sizes and total # of clusters (group_cluster_wl == nullptr), and once
 * to compute the actual weight limits per cluster.
 *
 * @param delta_ tdigest compression level
 * @param num_groups The number of input groups
 * @param nearest_weight A functor which returns the nearest weight in the input
 * stream that falls before our current cluster limit
 * @param total_weight_ A functor which returns the expected total weight for
 * the entire stream of input values for the specified group.
 * @param cumulative_weight A functor which returns the cumulative weight for an
 * absolute input value index.
 * @param group_cluster_wl Output. The set of cluster weight limits for each group.
 * May be nullptr during the sizing pass.
 * @param group_num_clusters Output. The number of output clusters for each input group.
 * @param group_cluster_offsets Offsets per-group to the start of its clusters
 */
template <typename TotalWeightIter, typename NearestWeightFunc, typename CumulativeWeight>
__global__ void generate_cluster_limits_kernel(int delta_,
                                               size_type num_groups,
                                               NearestWeightFunc nearest_weight,
                                               TotalWeightIter total_weight_,
                                               CumulativeWeight cumulative_weight,
                                               double* group_cluster_wl,
                                               size_type* group_num_clusters,
                                               offset_type const* group_cluster_offsets)
{
  int const tid    = threadIdx.x + blockIdx.x * blockDim.x;
  auto const group_index = tid;
  if (group_index >= num_groups) { return; }

  // we will generate at most delta clusters.
  double const delta        = static_cast<double>(delta_);
  double const delta_norm   = delta / (2.0 * M_PI);
  double const total_weight = total_weight_[group_index];
  group_num_clusters[group_index] = 0;
  // a group with nothing in it.
  if (total_weight <= 0) { return; }

  // start at the correct place based on our cluster offset.
  double* cluster_wl =
    group_cluster_wl ? group_cluster_wl + group_cluster_offsets[group_index] : nullptr;

  double cur_limit        = 0.0;
  double cur_weight       = 0.0;
  double next_limit       = -1.0;
  int last_inserted_index = -1;

  // compute the first cluster limit
  double nearest_w;
  int nearest_w_index;
  while (1) {
    cur_weight = next_limit < 0 ? 0 : max(cur_weight + 1, nearest_w);
    if (cur_weight >= total_weight) { break; }

    // based on where we are closing the cluster off (not including the incoming weight),
    // compute the next cluster limit
    double const quantile = cur_weight / total_weight;
    next_limit            = total_weight * scale_func_k1(quantile, delta_norm);

    // if the next limit is < the cur limit, we're past the end of the distribution, so we're done.
    if (next_limit <= cur_limit) {
      if (cluster_wl) { cluster_wl[group_num_clusters[group_index]] = total_weight; }
      group_num_clusters[group_index]++;
      break;
    }

    // compute the weight we will be at in the input values just before closing off the current
    // cluster (because adding the next value will cross the current limit).
    // NOTE: can't use structured bindings here.
    thrust::tie(nearest_w, nearest_w_index) = nearest_weight(next_limit, group_index);

    if (cluster_wl) {
      // because of the way the scale functions work, it is possible to generate clusters
      // in such a way that we end up with "gaps" where there are no input values that
      // fall into a given cluster. An example would be this:
      //
      // cluster weight limits = 0.00003, 1.008, 3.008
      //
      // input values(weight) = A(1), B(2), C(3)
      //
      // naively inserting these values into the clusters simply by taking a lower_bound,
      // we would get the following distribution of input values into those 3 clusters.
      // (), (A), (B,C)
      //
      // whereas what we really want is:
      //
      // (A), (B), (C)
      //
      // to fix this, we will artificially adjust the output cluster limits to guarantee
      // at least 1 input value will be put in each cluster during the reduction step.
      // this does not affect final centroid results as we still use the "real" weight limits
      // to compute subsequent clusters - the purpose is only to allow cluster selection
      // during the reduction step to be trivial.
      //
      double adjusted_next_limit = next_limit;
      if (nearest_w_index == last_inserted_index || last_inserted_index < 0) {
        nearest_w_index = last_inserted_index + 1;
        auto [r, i, adjusted] = cumulative_weight(nearest_w_index);
        adjusted_next_limit   = max(next_limit, adjusted);
      }
      cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit;
      last_inserted_index                         = nearest_w_index;
    }
    group_num_clusters[group_index]++;
    cur_limit = next_limit;
  }
}
+
/**
 * @brief Compute a set of cluster limits (brackets, essentially) for a
 * given tdigest based on the specified delta and the total weight of values
 * to be added.
 *
 * The number of clusters generated will always be <= delta_, where delta_ is
 * a reasonably small number likely << 10000.
 *
 * Each input group gets an independent set of clusters generated.
 *
 * @param delta tdigest compression level
 * @param num_groups The number of input groups
 * @param nearest_weight A functor which returns the nearest weight in the input
 * stream that falls before our current cluster limit
 * @param total_weight A functor which returns the expected total weight for
 * the entire stream of input values for the specified group.
 * @param cumulative_weight A functor which returns the cumulative weight for an
 * absolute input value index.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory
 *
 * @returns A tuple containing the set of cluster weight limits for each group, a set of
 * list-style offsets indicating group sizes, and the total number of clusters
 */
template <typename NearestWeight, typename TotalWeightIter, typename CumulativeWeight>
std::tuple<rmm::device_uvector<double>, std::unique_ptr<column>, size_type>
generate_group_cluster_info(int delta,
                            size_type num_groups,
                            NearestWeight nearest_weight,
                            TotalWeightIter total_weight,
                            CumulativeWeight cumulative_weight,
                            rmm::cuda_stream_view stream,
                            rmm::mr::device_memory_resource* mr)
{
  constexpr size_type block_size = 256;
  cudf::detail::grid_1d const grid(num_groups, block_size);

  // compute number of clusters per group
  // each thread computes 1 set of clusters (# of cluster sets == # of groups)
  rmm::device_uvector<size_type> group_num_clusters(num_groups, stream);
  // sizing pass: group_cluster_wl == nullptr, only group_num_clusters is filled in
  generate_cluster_limits_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(
    delta,
    num_groups,
    nearest_weight,
    total_weight,
    cumulative_weight,
    nullptr,
    group_num_clusters.begin(),
    nullptr);

  // generate group cluster offsets (where the clusters for a given group start and end)
  auto group_cluster_offsets = cudf::make_fixed_width_column(
    data_type{type_id::INT32}, num_groups + 1, mask_state::UNALLOCATED, stream, mr);
  auto cluster_size = cudf::detail::make_counting_transform_iterator(
    0, [group_num_clusters = group_num_clusters.begin(), num_groups] __device__(size_type index) {
      // final entry is 0 so the exclusive_scan produces list-style offsets
      return index == num_groups ? 0 : group_num_clusters[index];
    });
  thrust::exclusive_scan(rmm::exec_policy(stream),
                         cluster_size,
                         cluster_size + num_groups + 1,
                         group_cluster_offsets->mutable_view().begin<offset_type>(),
                         0);

  // total # of clusters
  offset_type const total_clusters =
    cudf::detail::get_value<offset_type>(group_cluster_offsets->view(), num_groups, stream);

  // fill in the actual cluster weight limits
  rmm::device_uvector<double> group_cluster_wl(total_clusters, stream);
  generate_cluster_limits_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(
    delta,
    num_groups,
    nearest_weight,
    total_weight,
    cumulative_weight,
    group_cluster_wl.begin(),
    group_num_clusters.begin(),
    group_cluster_offsets->view().begin<offset_type>());

  return {std::move(group_cluster_wl),
          std::move(group_cluster_offsets),
          static_cast<size_type>(total_clusters)};
}
+
/**
 * @brief Compute a column of tdigests.
 *
 * Assembles the output tdigest column based on the specified delta, a stream of
 * input values (either scalar or centroids), and an assortment of per-group
 * clustering information.
 *
 * This function is effectively just a reduce_by_key that performs a reduction
 * from input values -> centroid clusters as defined by the cluster weight
 * boundaries.
 *
 * @param delta tdigest compression level
 * @param centroids_begin Beginning of the range of input centroids.
 * @param centroids_end End of the range of input centroids.
 * @param group_cumulative_weight Functor which returns cumulative weight and group information
 * for an absolute input value index.
 * @param min_col Column containing the minimum value per group.
 * @param max_col Column containing the maximum value per group.
 * @param group_cluster_wl Cluster weight limits for each group.
 * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits.
 * @param total_clusters Total number of clusters in all groups.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory
 *
 * @returns A tdigest column with 1 row per output tdigest.
 */
template <typename CentroidIter, typename CumulativeWeight>
std::unique_ptr<column> compute_tdigests(int delta,
                                         CentroidIter centroids_begin,
                                         CentroidIter centroids_end,
                                         CumulativeWeight group_cumulative_weight,
                                         std::unique_ptr<column>&& min_col,
                                         std::unique_ptr<column>&& max_col,
                                         rmm::device_uvector<double> const& group_cluster_wl,
                                         std::unique_ptr<column>&& group_cluster_offsets,
                                         size_type total_clusters,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
{
  // the output for each group is a column of data that represents the tdigest. since we want 1 row
  // per group, each row will be a list the length of the tdigest for that group. so our output
  // column is of the form:
  // struct {
  //   centroids for the digest
  //   list {
  //    struct {
  //      double    // mean
  //      double    // weight
  //    }
  //   }
  //   double       // min
  //   double       // max
  // }
  //
  if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); }
  std::vector<std::unique_ptr<column>> inner_children;
  // mean
  inner_children.push_back(cudf::make_fixed_width_column(
    data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr));
  // weight
  inner_children.push_back(cudf::make_fixed_width_column(
    data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr));
  // tdigest struct
  auto tdigests =
    cudf::make_structs_column(total_clusters, std::move(inner_children), 0, {}, stream, mr);

  // each input group represents an individual tdigest. within each tdigest, we want the keys
  // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
  // into the range 0-99). But since we have multiple tdigests, we need to keep the keys unique
  // between the groups, so we add our group start offset.
  auto keys = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [group_cluster_wl      = group_cluster_wl.data(),
     group_cluster_offsets = group_cluster_offsets->view().begin<offset_type>(),
     group_cumulative_weight] __device__(size_type value_index) -> size_type {
      auto [group_index, relative_value_index, cumulative_weight] =
        group_cumulative_weight(value_index);

      // compute start of cluster weight limits for this group
      double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index];
      auto const num_clusters =
        group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index];

      // local cluster index. the min() guards against the last value landing exactly on
      // the final weight limit.
      size_type const group_cluster_index =
        min(num_clusters - 1,
            static_cast<size_type>(
              thrust::lower_bound(
                thrust::seq, weight_limits, weight_limits + num_clusters, cumulative_weight) -
              weight_limits));

      // add the cluster offset to generate a globally unique key
      return group_cluster_index + group_cluster_offsets[group_index];
    });

  // reduce the centroids down by key.
  cudf::mutable_column_view mean_col =
    tdigests->child(cudf::detail::tdigest::mean_column_index).mutable_view();
  cudf::mutable_column_view weight_col =
    tdigests->child(cudf::detail::tdigest::weight_column_index).mutable_view();
  auto output = thrust::make_zip_iterator(thrust::make_tuple(
    mean_col.begin<double>(), weight_col.begin<double>(), thrust::make_discard_iterator()));
  auto const num_values = std::distance(centroids_begin, centroids_end);
  thrust::reduce_by_key(rmm::exec_policy(stream),
                        keys,
                        keys + num_values,                // keys
                        centroids_begin,                  // values
                        thrust::make_discard_iterator(),  // key output
                        output,                           // output
                        thrust::equal_to{},               // key equality check
                        merge_centroids{});

  // create the list
  auto const num_groups = group_cluster_offsets->size() - 1;
  auto list = cudf::make_lists_column(
    num_groups, std::move(group_cluster_offsets), std::move(tdigests), 0, {});

  // create final tdigest column
  std::vector<std::unique_ptr<column>> children;
  children.push_back(std::move(list));
  children.push_back(std::move(min_col));
  children.push_back(std::move(max_col));
  return make_structs_column(num_groups, std::move(children), 0, {}, stream, mr);
}
+
// retrieve total weight of scalar inputs by group index
struct scalar_total_weight {
  size_type const* group_valid_counts;  // per-group count of valid input values

  // with unit weights, a group's total weight is simply its valid count
  __device__ double operator()(size_type group_index)
  {
    return static_cast<double>(group_valid_counts[group_index]);
  }
};
+
// return the min/max value of scalar inputs by group index.
// assumes values within each group are sorted ascending with nulls last, so the
// first element is the min and the last valid element is the max.
template <typename T>
struct get_scalar_minmax {
  column_device_view const col;
  device_span<size_type const> group_offsets;
  size_type const* group_valid_counts;

  __device__ thrust::tuple<double, double> operator()(size_type group_index)
  {
    // note: .element<T>() is taking care of fixed-point conversions for us.
    return {static_cast<double>(col.element<T>(group_offsets[group_index])),
            static_cast<double>(
              col.element<T>(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))};
  }
};
+
// type-dispatched implementation of group_tdigest. only numeric and fixed-point
// input types are supported.
struct typed_group_tdigest {
  template <
    typename T,
    typename std::enable_if_t<cudf::is_numeric<T>() || cudf::is_fixed_point<T>()>* = nullptr>
  std::unique_ptr<column> operator()(column_view const& col,
                                     cudf::device_span<size_type const> group_offsets,
                                     cudf::device_span<size_type const> group_labels,
                                     cudf::device_span<size_type const> group_valid_counts,
                                     size_type num_groups,
                                     int delta,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
  {
    // first, generate cluster weight information for each input group
    auto total_weight = cudf::detail::make_counting_transform_iterator(
      0, scalar_total_weight{group_valid_counts.begin()});
    auto [group_cluster_wl, group_cluster_offsets, total_clusters] =
      generate_group_cluster_info(delta,
                                  num_groups,
                                  nearest_value_scalar_weights{},
                                  total_weight,
                                  cumulative_scalar_weight{group_offsets, group_labels},
                                  stream,
                                  mr);

    // device column view. handy because the .element() function
    // automatically handles fixed-point conversions for us
    auto d_col = cudf::column_device_view::create(col, stream);

    // compute min and max columns
    auto min_col = cudf::make_fixed_width_column(
      data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
    auto max_col = cudf::make_fixed_width_column(
      data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
    thrust::transform(
      rmm::exec_policy(stream),
      thrust::make_counting_iterator(0),
      thrust::make_counting_iterator(0) + num_groups,
      thrust::make_zip_iterator(thrust::make_tuple(min_col->mutable_view().begin<double>(),
                                                   max_col->mutable_view().begin<double>())),
      get_scalar_minmax<T>{*d_col, group_offsets, group_valid_counts.begin()});

    // for simple input values, the "centroids" all have a weight of 1.
    auto scalar_to_centroid =
      cudf::detail::make_counting_transform_iterator(0, make_centroid<T>{*d_col});

    // generate the final tdigest
    return compute_tdigests(delta,
                            scalar_to_centroid,
                            scalar_to_centroid + col.size(),
                            cumulative_scalar_weight{group_offsets, group_labels},
                            std::move(min_col),
                            std::move(max_col),
                            group_cluster_wl,
                            std::move(group_cluster_offsets),
                            total_clusters,
                            stream,
                            mr);
  }

  template <
    typename T,
    typename std::enable_if_t<!cudf::is_numeric<T>() && !cudf::is_fixed_point<T>()>* = nullptr>
  std::unique_ptr<column> operator()(column_view const& col,
                                     cudf::device_span<size_type const> group_offsets,
                                     cudf::device_span<size_type const> group_labels,
                                     cudf::device_span<size_type const> group_valid_counts,
                                     size_type num_groups,
                                     int delta,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
  {
    CUDF_FAIL("Non-numeric type in group_tdigest");
  }
};
+
+} // anonymous namespace
+
// public entry point: build one tdigest per group from grouped scalar values.
// see group_reductions.hpp for the full contract.
std::unique_ptr<column> group_tdigest(column_view const& col,
                                      cudf::device_span<size_type const> group_offsets,
                                      cudf::device_span<size_type const> group_labels,
                                      cudf::device_span<size_type const> group_valid_counts,
                                      size_type num_groups,
                                      int max_centroids,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
{
  // no input -> canonical empty tdigest column
  if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); }

  // max_centroids is the tdigest "delta" compression parameter
  auto const delta = max_centroids;
  return cudf::type_dispatcher(col.type(),
                               typed_group_tdigest{},
                               col,
                               group_offsets,
                               group_labels,
                               group_valid_counts,
                               num_groups,
                               delta,
                               stream,
                               mr);
}
+
// public entry point: merge the tdigests within each group into a single new
// tdigest per group. see group_reductions.hpp for the full contract.
std::unique_ptr<column> group_merge_tdigest(column_view const& input,
                                            cudf::device_span<size_type const> group_offsets,
                                            cudf::device_span<size_type const> group_labels,
                                            size_type num_groups,
                                            int max_centroids,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  cudf::detail::tdigest::check_is_valid_tdigest_column(input);

  if (num_groups == 0 || input.size() == 0) {
    return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr);
  }

  structs_column_view scv(input);
  lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index));
  // ideally, we would just call .parent().child() here because tdigests cannot be
  // sliced. however, lists_column_view() hides that particular interface. However,
  // for the same reason, get_sliced_child() should be just as cheap.
  auto data = lcv.get_sliced_child(stream);
  structs_column_view tdigest(data);
  auto mean   = tdigest.child(cudf::detail::tdigest::mean_column_index);
  auto weight = tdigest.child(cudf::detail::tdigest::weight_column_index);

  // first step is to merge all the tdigests in each group. at the moment the only way to
  // make this work is to retrieve the group sizes (via group_offsets) and the individual digest
  // sizes (via input.offsets()) to the gpu and do the merges. The scale problem is that while the
  // size of each group will likely be small (size of each group will typically map to # of batches
  // the input data was chopped into for tdigest generation), the -number- of groups can be
  // arbitrarily large.
  //
  // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an
  // algorithm like a super-merge that takes two layers of keys: one which identifies the outer
  // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the
  // outer groups.

  // bring group offsets back to the host
  std::vector<size_type> h_outer_offsets(group_offsets.size());
  cudaMemcpyAsync(h_outer_offsets.data(),
                  group_offsets.data(),
                  sizeof(size_type) * group_offsets.size(),
                  cudaMemcpyDeviceToHost,
                  stream);

  // bring tdigest offsets back to the host
  auto tdigest_offsets = lcv.offsets();
  std::vector<size_type> h_inner_offsets(tdigest_offsets.size());
  cudaMemcpyAsync(h_inner_offsets.data(),
                  tdigest_offsets.begin<size_type>(),
                  sizeof(size_type) * tdigest_offsets.size(),
                  cudaMemcpyDeviceToHost,
                  stream);

  // both copies must land before the host reads the offsets below
  stream.synchronize();

  // extract all means and weights into a table
  cudf::table_view tdigests_unsliced({mean, weight});

  // generate the merged (but not yet compressed) tdigests for each group.
  std::vector<std::unique_ptr<table>> tdigests;
  tdigests.reserve(num_groups);
  std::transform(
    h_outer_offsets.begin(),
    h_outer_offsets.end() - 1,
    std::next(h_outer_offsets.begin()),
    std::back_inserter(tdigests),
    [&](auto tdigest_start, auto tdigest_end) {
      // the range of tdigests in this group
      auto const num_tdigests = tdigest_end - tdigest_start;

      // slice each tdigest from the input
      std::vector<table_view> unmerged_tdigests;
      unmerged_tdigests.reserve(num_tdigests);
      auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start);
      std::transform(offset_iter,
                     offset_iter + num_tdigests,
                     std::next(offset_iter),
                     std::back_inserter(unmerged_tdigests),
                     [&](auto start, auto end) {
                       return cudf::detail::slice(tdigests_unsliced, {start, end}, stream);
                     });

      // merge the slices, ordered by centroid mean (column 0)
      return cudf::detail::merge(unmerged_tdigests, {0}, {order::ASCENDING}, {}, stream, mr);
    });

  // generate min and max values
  auto min_col        = scv.child(cudf::detail::tdigest::min_column_index);
  auto merged_min_col = cudf::make_fixed_width_column(
    data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
  thrust::reduce_by_key(rmm::exec_policy(stream),
                        group_labels.begin(),
                        group_labels.end(),
                        min_col.begin<double>(),
                        thrust::make_discard_iterator(),
                        merged_min_col->mutable_view().begin<double>(),
                        thrust::equal_to{},  // key equality check
                        thrust::minimum{});

  auto max_col        = scv.child(cudf::detail::tdigest::max_column_index);
  auto merged_max_col = cudf::make_fixed_width_column(
    data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
  thrust::reduce_by_key(rmm::exec_policy(stream),
                        group_labels.begin(),
                        group_labels.end(),
                        max_col.begin<double>(),
                        thrust::make_discard_iterator(),
                        merged_max_col->mutable_view().begin<double>(),
                        thrust::equal_to{},  // key equality check
                        thrust::maximum{});

  // concatenate all the merged tdigests back into one table.
  std::vector<table_view> tdigest_views;
  tdigest_views.reserve(num_groups);
  std::transform(tdigests.begin(),
                 tdigests.end(),
                 std::back_inserter(tdigest_views),
                 [](std::unique_ptr<table> const& t) { return t->view(); });
  auto merged = cudf::detail::concatenate(tdigest_views, stream, mr);

  // generate cumulative weights. temporary working data, so no user mr; allocate on
  // the caller's stream so it is ordered with the work below.
  auto merged_weights     = merged->get_column(cudf::detail::tdigest::weight_column_index).view();
  auto cumulative_weights = cudf::make_fixed_width_column(
    data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream);
  auto keys = cudf::detail::make_counting_transform_iterator(
    0,
    [group_labels      = group_labels.begin(),
     inner_offsets     = tdigest_offsets.begin<offset_type>(),
     num_inner_offsets = tdigest_offsets.size()] __device__(int index) {
      // what -original- tdigest index this absolute index corresponds to
      auto const iter = thrust::prev(
        thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index));
      auto const tdigest_index = thrust::distance(inner_offsets, iter);

      // what group index the original tdigest belongs to
      return group_labels[tdigest_index];
    });
  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                keys,
                                keys + cumulative_weights->size(),
                                merged_weights.begin<double>(),
                                cumulative_weights->mutable_view().begin<double>());

  auto const delta = max_centroids;

  // generate cluster info
  auto total_group_weight = cudf::detail::make_counting_transform_iterator(
    0,
    [outer_offsets = group_offsets.data(),
     inner_offsets = tdigest_offsets.begin<offset_type>(),
     cumulative_weights =
       cumulative_weights->view().begin<double>()] __device__(size_type group_index) {
      // total weight of a group == cumulative weight at the group's last centroid
      auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1;
      return cumulative_weights[last_weight_index];
    });
  auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
    delta,
    num_groups,
    nearest_value_centroid_weights{cumulative_weights->view().begin<double>(),
                                   group_offsets.data(),
                                   tdigest_offsets.begin<offset_type>()},
    total_group_weight,
    cumulative_centroid_weight{
      cumulative_weights->view().begin<double>(),
      group_labels,
      group_offsets.data(),
      {tdigest_offsets.begin<offset_type>(), static_cast<size_t>(tdigest_offsets.size())}},
    stream,
    mr);

  // input centroid values
  auto centroids = cudf::detail::make_counting_transform_iterator(
    0,
    make_weighted_centroid{
      merged->get_column(cudf::detail::tdigest::mean_column_index).view().begin<double>(),
      merged_weights.begin<double>()});

  // compute the tdigest
  return compute_tdigests(delta,
                          centroids,
                          centroids + merged->num_rows(),
                          cumulative_centroid_weight{
                            cumulative_weights->view().begin<double>(),
                            group_labels,
                            group_offsets.data(),
                            {tdigest_offsets.begin<offset_type>(),
                             static_cast<size_t>(tdigest_offsets.size())}},
                          std::move(merged_min_col),
                          std::move(merged_max_col),
                          group_cluster_wl,
                          std::move(group_cluster_offsets),
                          total_clusters,
                          stream,
                          mr);
}
+
+} // namespace detail
+} // namespace groupby
+} // namespace cudf
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
new file mode 100644
index 00000000000..9aea59a195b
--- /dev/null
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include