From 12dc13007fd670af53c8fc869a3abebbe6188375 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:51:37 -0400
Subject: [PATCH] Rework some code logic to reduce iterator and comparator
 inlining to improve compile time (#12900)

Disables inlining the device code logic for the row operators for nested column types did not work as hoped.
Some files took longer to compile and some functions ran 20% slower for large rows.

Reworking individual source files to break up the code logic into multiple kernels seems to work well for compile time while having a smaller effect on performance. The goal is to only rework the nested column code paths.
Here are some source files that have compile time issues and are improved in this PR.

| source file  | current | PR |
|:--- | ---:| ---:|
|   stream_compaction/unique_count.cu | 18 min |  13 min |
|   groupby/sort/group_nunique.cu   | 16 min |  2 min  |
|   stream_compaction/unique.cu  | 16 min | 5 min |
|   groupby/sort/sort_helper.cu | 10 min | 6.5 min |
|   search/contains_scalar.cu | 12 min | 4.7 min  |
|   sort/is_sorted.cu | 9 min | 7 min |
|   groupby/sort/group_std.cu | 7 min | 1.2 min |
|   groupby/sort/group_m2.cu | 6 min | 1.2 min  |

Available benchmarks showed minimal impact to performance.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/12900
---
 cpp/benchmarks/CMakeLists.txt                 |  1 +
 .../stream_compaction/unique_count.cpp        | 53 +++++++++++++
 cpp/src/groupby/sort/group_m2.cu              | 18 +++--
 cpp/src/groupby/sort/group_nunique.cu         | 36 +++++----
 cpp/src/groupby/sort/group_std.cu             | 18 +++--
 cpp/src/groupby/sort/sort_helper.cu           | 29 +++++--
 cpp/src/search/contains_scalar.cu             | 29 ++++---
 cpp/src/sort/is_sorted.cu                     | 25 +++++--
 cpp/src/stream_compaction/unique.cu           | 75 +++++++++++--------
 cpp/src/stream_compaction/unique_count.cu     | 20 +++--
 10 files changed, 217 insertions(+), 87 deletions(-)
 create mode 100644 cpp/benchmarks/stream_compaction/unique_count.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index cc0b642a337..e01d7745e94 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
   STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
+  stream_compaction/unique_count.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp
new file mode 100644
index 00000000000..f8319e0385c
--- /dev/null
+++ b/cpp/benchmarks/stream_compaction/unique_count.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/types.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void nvbench_unique_count(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("NumRows"));
+  auto const nulls    = state.get_float64("NullProbability");
+
+  data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows / 100);
+
+  auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  auto sorted_table  = cudf::sort(cudf::table_view({source_column->view()}));
+
+  auto input = sorted_table->view();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::unique_count(input, cudf::null_equality::EQUAL);
+  });
+}
+
+using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
+
+NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type))
+  .set_name("unique_count")
+  .set_type_axes_names({"Type"})
+  .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_float64_axis("NullProbability", {0.0, 0.1});
diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu
index edc8b089120..70b05100fb0 100644
--- a/cpp/src/groupby/sort/group_m2.cu
+++ b/cpp/src/groupby/sort/group_m2.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,10 +25,12 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/reduce.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace groupby {
@@ -62,15 +64,19 @@ void compute_m2_fn(column_device_view const& values,
                    ResultType* d_result,
                    rmm::cuda_stream_view stream)
 {
-  auto const var_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    m2_transform<ResultType, decltype(values_iter)>{
-      values, values_iter, d_means, group_labels.data()});
+  auto m2_fn = m2_transform<ResultType, decltype(values_iter)>{
+    values, values_iter, d_means, group_labels.data()};
+  auto const itr = thrust::counting_iterator<size_type>(0);
+  // Using a temporary buffer for intermediate transform results instead of
+  // using the transform-iterator directly in thrust::reduce_by_key
+  // improves compile-time significantly.
+  auto m2_vals = rmm::device_uvector<ResultType>(values.size(), stream);
+  thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn);
 
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels.begin(),
                         group_labels.end(),
-                        var_iter,
+                        m2_vals.begin(),
                         thrust::make_discard_iterator(),
                         d_result);
 }
diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu
index cf81253483e..1a5f1691d5b 100644
--- a/cpp/src/groupby/sort/group_nunique.cu
+++ b/cpp/src/groupby/sort/group_nunique.cu
@@ -94,21 +94,20 @@ std::unique_ptr<column> group_nunique(column_view const& values,
 
   auto const d_values_view = column_device_view::create(values, stream);
 
+  auto d_result = rmm::device_uvector<size_type>(group_labels.size(), stream);
+
   auto const comparator_helper = [&](auto const d_equal) {
-    auto const is_unique_iterator =
-      thrust::make_transform_iterator(thrust::counting_iterator<cudf::size_type>(0),
-                                      is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
-                                                            *d_values_view,
-                                                            d_equal,
-                                                            null_handling,
-                                                            group_offsets.data(),
-                                                            group_labels.data()});
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          is_unique_iterator,
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
+    auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
+                                    *d_values_view,
+                                    d_equal,
+                                    null_handling,
+                                    group_offsets.data(),
+                                    group_labels.data()};
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(values.size()),
+                      d_result.begin(),
+                      fn);
   };
 
   if (cudf::detail::has_nested_columns(values_view)) {
@@ -121,6 +120,15 @@ std::unique_ptr<column> group_nunique(column_view const& values,
     comparator_helper(d_equal);
   }
 
+  // calling this with a vector instead of a transform iterator is 10x faster to compile;
+  // it also helps that we are only calling it once for both conditions
+  thrust::reduce_by_key(rmm::exec_policy(stream),
+                        group_labels.begin(),
+                        group_labels.end(),
+                        d_result.begin(),
+                        thrust::make_discard_iterator(),
+                        result->mutable_view().begin<size_type>());
+
   return result;
 }
 
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu
index a3efc1f172a..8cd2d8baf4e 100644
--- a/cpp/src/groupby/sort/group_std.cu
+++ b/cpp/src/groupby/sort/group_std.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/for_each.h>
@@ -33,6 +34,7 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace groupby {
@@ -48,7 +50,7 @@ struct var_transform {
   size_type const* d_group_labels;
   size_type ddof;
 
-  __device__ ResultType operator()(size_type i)
+  __device__ ResultType operator()(size_type i) const
   {
     if (d_values.is_null(i)) return 0.0;
 
@@ -75,15 +77,19 @@ void reduce_by_key_fn(column_device_view const& values,
                       ResultType* d_result,
                       rmm::cuda_stream_view stream)
 {
-  auto var_iter = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0),
-    var_transform<ResultType, decltype(values_iter)>{
-      values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
+  auto var_fn = var_transform<ResultType, decltype(values_iter)>{
+    values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof};
+  auto const itr = thrust::make_counting_iterator<size_type>(0);
+  // Using a temporary buffer for intermediate transform results instead of
+  // using the transform-iterator directly in thrust::reduce_by_key
+  // improves compile-time significantly.
+  auto vars = rmm::device_uvector<ResultType>(values.size(), stream);
+  thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn);
 
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels.begin(),
                         group_labels.end(),
-                        var_iter,
+                        vars.begin(),
                         thrust::make_discard_iterator(),
                         d_result);
 }
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 6e992f2f53b..5b5a6356d67 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -16,6 +16,8 @@
 
 #include "common_utils.cuh"
 
+#include <stream_compaction/stream_compaction_common.cuh>
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/copy.hpp>
@@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
 {
   if (_group_offsets) return *_group_offsets;
 
-  _group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
+  auto const size = num_keys(stream);
+  _group_offsets  = std::make_unique<index_vector>(size + 1, stream);
 
   auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream};
 
@@ -154,23 +157,33 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
   if (cudf::detail::has_nested_columns(_keys)) {
     auto const d_key_equal = comparator.equal_to<true>(
       cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
-    result_end = thrust::unique_copy(rmm::exec_policy(stream),
-                                     thrust::counting_iterator<size_type>(0),
-                                     thrust::counting_iterator<size_type>(num_keys(stream)),
-                                     _group_offsets->begin(),
-                                     permuted_row_equality_comparator(d_key_equal, sorted_order));
+    // Using a temporary buffer for intermediate transform results from the iterator containing
+    // the comparator speeds up compile-time significantly without much degradation in
+    // runtime performance over using the comparator directly in thrust::unique_copy.
+    auto result       = rmm::device_uvector<bool>(size, stream);
+    auto const itr    = thrust::make_counting_iterator<size_type>(0);
+    auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order);
+    auto const ufn    = cudf::detail::unique_copy_fn<decltype(itr), decltype(row_eq)>{
+      itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1};
+    thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn);
+    result_end = thrust::copy_if(rmm::exec_policy(stream),
+                                 itr,
+                                 itr + size,
+                                 result.begin(),
+                                 _group_offsets->begin(),
+                                 thrust::identity<bool>{});
   } else {
     auto const d_key_equal = comparator.equal_to<false>(
       cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
     result_end = thrust::unique_copy(rmm::exec_policy(stream),
                                      thrust::counting_iterator<size_type>(0),
-                                     thrust::counting_iterator<size_type>(num_keys(stream)),
+                                     thrust::counting_iterator<size_type>(size),
                                      _group_offsets->begin(),
                                      permuted_row_equality_comparator(d_key_equal, sorted_order));
   }
 
   size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
-  _group_offsets->set_element(num_groups, num_keys(stream), stream);
+  _group_offsets->set_element(num_groups, size, stream);
   _group_offsets->resize(num_groups + 1, stream);
 
   return *_group_offsets;
diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu
index 093a1f8f1ed..7c16a1b12ef 100644
--- a/cpp/src/search/contains_scalar.cu
+++ b/cpp/src/search/contains_scalar.cu
@@ -30,6 +30,7 @@
 
 #include <thrust/count.h>
 #include <thrust/pair.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace detail {
@@ -108,16 +109,24 @@ struct contains_scalar_dispatch {
     auto const haystack_cdv_ptr = column_device_view::create(haystack, stream);
 
     auto const d_comp = comparator.equal_to<true>(nullate::DYNAMIC{has_nulls});
-    return thrust::count_if(
-             rmm::exec_policy(stream),
-             begin,
-             end,
-             [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
-               if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
-                 return false;
-               }
-               return d_comp(idx, rhs_index_type{0});  // compare haystack[idx] == needle[0].
-             }) > 0;
+
+    // Using a temporary buffer for intermediate transform results from the lambda containing
+    // the comparator speeds up compile-time significantly without much degradation in
+    // runtime performance over using the comparator in a transform iterator with thrust::count_if.
+    auto d_results = rmm::device_uvector<bool>(haystack.size(), stream);
+    thrust::transform(
+      rmm::exec_policy(stream),
+      begin,
+      end,
+      d_results.begin(),
+      [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
+        if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
+          return false;
+        }
+        return d_comp(idx, rhs_index_type{0});  // compare haystack[idx] == needle[0].
+      });
+
+    return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0;
   }
 };
 
diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu
index 4c5ad1ef0ea..25c594e9e74 100644
--- a/cpp/src/sort/is_sorted.cu
+++ b/cpp/src/sort/is_sorted.cu
@@ -27,13 +27,15 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/count.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/sort.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace detail {
 
-auto is_sorted(cudf::table_view const& in,
+bool is_sorted(cudf::table_view const& in,
                std::vector<order> const& column_order,
                std::vector<null_order> const& null_precedence,
                rmm::cuda_stream_view stream)
@@ -44,16 +46,25 @@ auto is_sorted(cudf::table_view const& in,
   if (cudf::detail::has_nested_columns(in)) {
     auto const device_comparator = comparator.less<true>(has_nested_nulls(in));
 
-    return thrust::is_sorted(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(in.num_rows()),
-                             device_comparator);
+    // Using a temporary buffer for intermediate transform results from the lambda containing
+    // the comparator speeds up compile-time significantly over using the comparator directly
+    // in thrust::is_sorted.
+    auto d_results = rmm::device_uvector<bool>(in.num_rows(), stream);
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::counting_iterator<size_type>(0),
+                      thrust::counting_iterator<size_type>(in.num_rows()),
+                      d_results.begin(),
+                      [device_comparator] __device__(auto idx) -> bool {
+                        return (idx == 0) || device_comparator(idx - 1, idx);
+                      });
+
+    return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0;
   } else {
     auto const device_comparator = comparator.less<false>(has_nested_nulls(in));
 
     return thrust::is_sorted(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(in.num_rows()),
+                             thrust::counting_iterator<size_type>(0),
+                             thrust::counting_iterator<size_type>(in.num_rows()),
                              device_comparator);
   }
 }
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index 511a7b7ae1c..2d81c00e9d9 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -67,38 +67,51 @@ std::unique_ptr<table> unique(table_view const& input,
 
   auto comp = cudf::experimental::row::equality::self_comparator(keys_view, stream);
 
-  auto const comparator_helper = [&](auto const row_equal) {
-    // get indices of unique rows
-    auto result_end = unique_copy(thrust::counting_iterator<size_type>(0),
-                                  thrust::counting_iterator<size_type>(num_rows),
-                                  mutable_view->begin<size_type>(),
-                                  row_equal,
-                                  keep,
-                                  stream);
+  size_type const unique_size = [&] {
+    if (cudf::detail::has_nested_columns(keys_view)) {
+      // Using a temporary buffer for intermediate transform results from the functor containing
+      // the comparator speeds up compile-time significantly without much degradation in
+      // runtime performance over using the comparator directly in thrust::unique_copy.
+      auto row_equal =
+        comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
+      auto d_results = rmm::device_uvector<bool>(num_rows, stream);
+      auto itr       = thrust::make_counting_iterator<size_type>(0);
+      thrust::transform(
+        rmm::exec_policy(stream),
+        itr,
+        itr + num_rows,
+        d_results.begin(),
+        unique_copy_fn<decltype(itr), decltype(row_equal)>{itr, keep, row_equal, num_rows - 1});
+      auto result_end = thrust::copy_if(rmm::exec_policy(stream),
+                                        itr,
+                                        itr + num_rows,
+                                        d_results.begin(),
+                                        mutable_view->begin<size_type>(),
+                                        thrust::identity<bool>{});
+      return static_cast<size_type>(thrust::distance(mutable_view->begin<size_type>(), result_end));
+    } else {
+      // Using thrust::unique_copy with the comparator directly will compile more slowly but
+      // improves runtime by up to 2x over the transform/copy_if approach above.
+      auto row_equal =
+        comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
+      auto result_end = unique_copy(thrust::counting_iterator<size_type>(0),
+                                    thrust::counting_iterator<size_type>(num_rows),
+                                    mutable_view->begin<size_type>(),
+                                    row_equal,
+                                    keep,
+                                    stream);
+      return static_cast<size_type>(thrust::distance(mutable_view->begin<size_type>(), result_end));
+    }
+  }();
+  auto indices_view = cudf::detail::slice(column_view(*unique_indices), 0, unique_size);
 
-    auto indices_view =
-      cudf::detail::slice(column_view(*unique_indices),
-                          0,
-                          thrust::distance(mutable_view->begin<size_type>(), result_end));
-
-    // gather unique rows and return
-    return detail::gather(input,
-                          indices_view,
-                          out_of_bounds_policy::DONT_CHECK,
-                          detail::negative_index_policy::NOT_ALLOWED,
-                          stream,
-                          mr);
-  };
-
-  if (cudf::detail::has_nested_columns(keys_view)) {
-    auto row_equal =
-      comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
-    return comparator_helper(row_equal);
-  } else {
-    auto row_equal =
-      comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
-    return comparator_helper(row_equal);
-  }
+  // gather unique rows and return
+  return detail::gather(input,
+                        indices_view,
+                        out_of_bounds_policy::DONT_CHECK,
+                        detail::negative_index_policy::NOT_ALLOWED,
+                        stream,
+                        mr);
 }
 }  // namespace detail
 
diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu
index 4c1cf2b2bc3..ac9924311c2 100644
--- a/cpp/src/stream_compaction/unique_count.cu
+++ b/cpp/src/stream_compaction/unique_count.cu
@@ -38,6 +38,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/logical.h>
+#include <thrust/transform.h>
 
 #include <cmath>
 #include <cstddef>
@@ -76,14 +77,23 @@ cudf::size_type unique_count(table_view const& keys,
   if (cudf::detail::has_nested_columns(keys)) {
     auto const comp =
       row_comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
-    return thrust::count_if(
-      rmm::exec_policy(stream),
-      thrust::counting_iterator<cudf::size_type>(0),
-      thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
-      [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
+    // Using a temporary buffer for intermediate transform results from the lambda containing
+    // the comparator speeds up compile-time significantly without much degradation in
+    // runtime performance over using the comparator directly in thrust::count_if.
+    auto d_results = rmm::device_uvector<bool>(keys.num_rows(), stream);
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(keys.num_rows()),
+                      d_results.begin(),
+                      [comp] __device__(auto i) { return (i == 0 or not comp(i, i - 1)); });
+
+    return static_cast<size_type>(
+      thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true));
   } else {
     auto const comp =
       row_comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
+    // Using thrust::copy_if with the comparator directly will compile more slowly but
+    // improves runtime by up to 2x over the transform/count approach above.
     return thrust::count_if(
       rmm::exec_policy(stream),
       thrust::counting_iterator<cudf::size_type>(0),