rapidsai · rapids-bot · Mar 27, 2023 · Mar 7, 2023 · Mar 7, 2023 · Mar 7, 2023
@@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
   STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
+  stream_compaction/unique_count.cpp
 )
 
 # ##################################################################################################

@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/types.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void nvbench_unique_count(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("NumRows"));
+  auto const nulls    = state.get_float64("NullProbability");
+
+  data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows / 100);
+
+  auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  auto sorted_table  = cudf::sort(cudf::table_view({source_column->view()}));
+
+  auto input = sorted_table->view();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::unique_count(input, cudf::null_equality::EQUAL);
+  });
+}
+
+using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
+
+NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type))
+  .set_name("unique_count")
+  .set_type_axes_names({"Type"})
+  .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_float64_axis("NullProbability", {0.0, 0.1});
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,10 +25,12 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/reduce.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace groupby {
@@ -62,15 +64,19 @@ void compute_m2_fn(column_device_view const& values,
                    ResultType* d_result,
                    rmm::cuda_stream_view stream)
 {
-  auto const var_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    m2_transform<ResultType, decltype(values_iter)>{
-      values, values_iter, d_means, group_labels.data()});
+  auto m2_fn = m2_transform<ResultType, decltype(values_iter)>{
+    values, values_iter, d_means, group_labels.data()};
+  auto const itr = thrust::counting_iterator<size_type>(0);
+  // Using a temporary buffer for intermediate transform results instead of
+  // using the transform-iterator directly in thrust::reduce_by_key
+  // improves compile-time significantly.
+  auto m2_vals = rmm::device_uvector<ResultType>(values.size(), stream);
+  thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn);
 
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels.begin(),
                         group_labels.end(),
-                        var_iter,
+                        m2_vals.begin(),
                         thrust::make_discard_iterator(),
                         d_result);
 }

@@ -94,21 +94,20 @@ std::unique_ptr<column> group_nunique(column_view const& values,
 
   auto const d_values_view = column_device_view::create(values, stream);
 
+  auto d_result = rmm::device_uvector<size_type>(group_labels.size(), stream);
+
   auto const comparator_helper = [&](auto const d_equal) {
-    auto const is_unique_iterator =
-      thrust::make_transform_iterator(thrust::counting_iterator<cudf::size_type>(0),
-                                      is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
-                                                            *d_values_view,
-                                                            d_equal,
-                                                            null_handling,
-                                                            group_offsets.data(),
-                                                            group_labels.data()});
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          is_unique_iterator,
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
+    auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
+                                    *d_values_view,
+                                    d_equal,
+                                    null_handling,
+                                    group_offsets.data(),
+                                    group_labels.data()};
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(values.size()),
+                      d_result.begin(),
+                      fn);
   };
 
   if (cudf::detail::has_nested_columns(values_view)) {
@@ -121,6 +120,15 @@ std::unique_ptr<column> group_nunique(column_view const& values,
     comparator_helper(d_equal);
   }
 
+  // calling this with a vector instead of a transform iterator is 10x faster to compile;
+  // it also helps that we are only calling it once for both conditions
+  thrust::reduce_by_key(rmm::exec_policy(stream),
+                        group_labels.begin(),
+                        group_labels.end(),
+                        d_result.begin(),
+                        thrust::make_discard_iterator(),
+                        result->mutable_view().begin<size_type>());
+
   return result;
 }
 

@@ -26,13 +26,15 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace groupby {
@@ -48,7 +50,7 @@ struct var_transform {
   size_type const* d_group_labels;
   size_type ddof;
 
-  __device__ ResultType operator()(size_type i)
+  __device__ ResultType operator()(size_type i) const
   {
     if (d_values.is_null(i)) return 0.0;
 
@@ -75,15 +77,19 @@ void reduce_by_key_fn(column_device_view const& values,
                       ResultType* d_result,
                       rmm::cuda_stream_view stream)
 {
-  auto var_iter = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0),
-    var_transform<ResultType, decltype(values_iter)>{
-      values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
+  auto var_fn = var_transform<ResultType, decltype(values_iter)>{
+    values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof};
+  auto const itr = thrust::make_counting_iterator<size_type>(0);
+  // Using a temporary buffer for intermediate transform results instead of
+  // using the transform-iterator directly in thrust::reduce_by_key
+  // improves compile-time significantly.
+  auto vars = rmm::device_uvector<ResultType>(values.size(), stream);
+  thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn);
 
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels.begin(),
                         group_labels.end(),
-                        var_iter,
+                        vars.begin(),
                         thrust::make_discard_iterator(),
                         d_result);
 }

@@ -16,6 +16,8 @@
 
 #include "common_utils.cuh"
 
+#include <stream_compaction/stream_compaction_common.cuh>
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/copy.hpp>
@@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
 {
   if (_group_offsets) return *_group_offsets;
 
-  _group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
+  auto const size = num_keys(stream);
+  _group_offsets  = std::make_unique<index_vector>(size + 1, stream);
 
   auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream};
 
@@ -154,23 +157,33 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
   if (cudf::detail::has_nested_columns(_keys)) {
     auto const d_key_equal = comparator.equal_to<true>(
       cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
-    result_end = thrust::unique_copy(rmm::exec_policy(stream),
-                                     thrust::counting_iterator<size_type>(0),
-                                     thrust::counting_iterator<size_type>(num_keys(stream)),
-                                     _group_offsets->begin(),
-                                     permuted_row_equality_comparator(d_key_equal, sorted_order));
+    // Using a temporary buffer for intermediate transform results from the iterator containing
+    // the comparator speeds up compile-time significantly without much degradation in
+    // runtime performance over using the comparator directly in thrust::unique_copy.
+    auto result       = rmm::device_uvector<bool>(size, stream);
+    auto const itr    = thrust::make_counting_iterator<size_type>(0);
+    auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order);
+    auto const ufn    = cudf::detail::unique_copy_fn<decltype(itr), decltype(row_eq)>{
+      itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1};
+    thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn);
+    result_end = thrust::copy_if(rmm::exec_policy(stream),
+                                 itr,
+                                 itr + size,
+                                 result.begin(),
+                                 _group_offsets->begin(),
+                                 thrust::identity<bool>{});
   } else {
     auto const d_key_equal = comparator.equal_to<false>(
       cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
     result_end = thrust::unique_copy(rmm::exec_policy(stream),
                                      thrust::counting_iterator<size_type>(0),
-                                     thrust::counting_iterator<size_type>(num_keys(stream)),
+                                     thrust::counting_iterator<size_type>(size),
                                      _group_offsets->begin(),
                                      permuted_row_equality_comparator(d_key_equal, sorted_order));
   }
 
   size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
-  _group_offsets->set_element(num_groups, num_keys(stream), stream);
+  _group_offsets->set_element(num_groups, size, stream);
   _group_offsets->resize(num_groups + 1, stream);
 
   return *_group_offsets;

@@ -30,6 +30,7 @@
 
 #include <thrust/count.h>
 #include <thrust/pair.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace detail {
@@ -108,16 +109,24 @@ struct contains_scalar_dispatch {
     auto const haystack_cdv_ptr = column_device_view::create(haystack, stream);
 
     auto const d_comp = comparator.equal_to<true>(nullate::DYNAMIC{has_nulls});
-    return thrust::count_if(
-             rmm::exec_policy(stream),
-             begin,
-             end,
-             [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
-               if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
-                 return false;
-               }
-               return d_comp(idx, rhs_index_type{0});  // compare haystack[idx] == needle[0].
-             }) > 0;
+
+    // Using a temporary buffer for intermediate transform results from the lambda containing
+    // the comparator speeds up compile-time significantly without much degradation in
+    // runtime performance over using the comparator in a transform iterator with thrust::count_if.
+    auto d_results = rmm::device_uvector<bool>(haystack.size(), stream);
+    thrust::transform(
+      rmm::exec_policy(stream),
+      begin,
+      end,
+      d_results.begin(),
+      [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
+        if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
+          return false;
+        }
+        return d_comp(idx, rhs_index_type{0});  // compare haystack[idx] == needle[0].
+      });
+
+    return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0;
   }
 };
 

@@ -27,13 +27,15 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/count.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/sort.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace detail {
 
-auto is_sorted(cudf::table_view const& in,
+bool is_sorted(cudf::table_view const& in,
                std::vector<order> const& column_order,
                std::vector<null_order> const& null_precedence,
                rmm::cuda_stream_view stream)
@@ -44,16 +46,25 @@ auto is_sorted(cudf::table_view const& in,
   if (cudf::detail::has_nested_columns(in)) {
     auto const device_comparator = comparator.less<true>(has_nested_nulls(in));
 
-    return thrust::is_sorted(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(in.num_rows()),
-                             device_comparator);
+    // Using a temporary buffer for intermediate transform results from the lambda containing
+    // the comparator speeds up compile-time significantly over using the comparator directly
+    // in thrust::is_sorted.
+    auto d_results = rmm::device_uvector<bool>(in.num_rows(), stream);
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::counting_iterator<size_type>(0),
+                      thrust::counting_iterator<size_type>(in.num_rows()),
+                      d_results.begin(),
+                      [device_comparator] __device__(auto idx) -> bool {
+                        return (idx == 0) || device_comparator(idx - 1, idx);
+                      });
+
+    return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0;
   } else {
     auto const device_comparator = comparator.less<false>(has_nested_nulls(in));
 
     return thrust::is_sorted(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(in.num_rows()),
+                             thrust::counting_iterator<size_type>(0),
+                             thrust::counting_iterator<size_type>(in.num_rows()),
                              device_comparator);
   }
 }