rapidsai · rapids-bot · Jun 27, 2024 · Mar 12, 2024 · Mar 13, 2024 · Jun 11, 2024
@@ -27,6 +27,7 @@
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
+#include <cuco/static_set.cuh>
 #include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -38,86 +39,118 @@
 
 namespace cudf {
 namespace detail {
-
-rmm::device_uvector<size_type> distinct_indices(table_view const& input,
-                                                duplicate_keep_option keep,
-                                                null_equality nulls_equal,
-                                                nan_equality nans_equal,
-                                                rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+namespace {
+/**
+ * @brief Invokes the given `func` with the desired row equality and probing method
+ *
+ * The element comparator is chosen at run time from `compare_nans`, while the probing scheme is
+ * chosen at compile time from `HasNested`. Both are then handed to `func`.
+ *
+ * @tparam HasNested Flag indicating whether there are nested columns in the input
+ * @tparam Hasher Type of device hash function
+ * @tparam Func Type of the helper function doing `distinct` check
+ *
+ * @param compare_nulls Control whether nulls should be compared as equal or not
+ * @param compare_nans Control whether floating-point NaN values should be compared as equal or not
+ * @param has_nulls Flag indicating whether the input has nulls or not
+ * @param row_equal Self table comparator
+ * @param d_hash Device hash functor
+ * @param func The input functor to invoke
+ * @return Whatever `func` returns when called with the device comparator and the probing scheme
+ */
+template <bool HasNested, typename Hasher, typename Func>
+rmm::device_uvector<cudf::size_type> dispatch_hash_set(
+  null_equality compare_nulls,
+  nan_equality compare_nans,
+  bool has_nulls,
+  cudf::experimental::row::equality::self_comparator row_equal,
+  Hasher const& d_hash,
+  Func&& func)
 {
-  if (input.num_rows() == 0 or input.num_columns() == 0) {
-    return rmm::device_uvector<size_type>(0, stream, mr);
-  }
-
-  auto map = hash_map_type{compute_hash_table_size(input.num_rows()),
-                           cuco::empty_key{-1},
-                           cuco::empty_value{std::numeric_limits<size_type>::min()},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
-
-  auto const preprocessed_input =
-    cudf::experimental::row::hash::preprocessed_table::create(input, stream);
-  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
-  auto const has_nested_columns = cudf::detail::has_nested_columns(input);
-
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
-  auto const insert_keys = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+  // Distinguish probing scheme CG sizes between nested and flat types for better performance
+  auto const probing_scheme = [&]() {
+    if constexpr (HasNested) {
+      return cuco::linear_probing<4, Hasher>{d_hash};  // nested input: CG size of 4
     } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+      return cuco::linear_probing<1, Hasher>{d_hash};  // flat input: CG size of 1
     }
-  };
+  }();  // invoked in place; each `if constexpr` branch yields a different scheme type
 
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    insert_keys(nan_equal_comparator{});
+  if (compare_nans == nan_equality::ALL_EQUAL) {  // use the NaN-equal element comparator
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{});
+    return func(d_equal, probing_scheme);
   } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    insert_keys(nan_unequal_comparator{});
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::physical_equality_comparator{});
+    return func(d_equal, probing_scheme);  // same call as above, with the plain comparator
   }
+}
+}  // namespace
 
-  auto output_indices = rmm::device_uvector<size_type>(map.get_size(), stream, mr);
+template <typename SetRef>
+void distinct_first_last_none(SetRef set,
+                              rmm::device_uvector<size_type>& output_indices,
+                              size_type num_rows,
+                              duplicate_keep_option keep,
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource* mr)
+{
+  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             reduction_init_value(keep));
 
-  // If we don't care about order, just gather indices of distinct keys taken from map.
-  if (keep == duplicate_keep_option::KEEP_ANY) {
-    map.retrieve_all(output_indices.begin(), thrust::make_discard_iterator(), stream.value());
-    return output_indices;
-  }
+  static auto constexpr cg_size = SetRef::cg_size;
+
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(num_rows * cg_size),
+    [set, keep, reduction_results = reduction_results.begin()] __device__(
+      size_type const idx) mutable {
+      size_type cg_idx = idx / cg_size;
+
+      auto [out_ptr, inserted] = [&]() {
+        if constexpr (cg_size == 1) {
+          return set.insert_and_find(idx);
+        } else {
+          auto const tile =
+            cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());
+          return set.insert_and_find(tile, cg_idx);
+        }
+      }();
+
+      auto const tile =
+        cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());
+      if (keep == duplicate_keep_option::KEEP_FIRST and tile.thread_rank() == 0) {
+        // Store the smallest index of all rows that are equal.
+        auto ref =
+          cuda::atomic_ref<size_type, cuda::thread_scope_device>{reduction_results[*out_ptr]};
+        ref.fetch_min(cg_idx, cuda::memory_order_relaxed);
+      }
+      if (keep == duplicate_keep_option::KEEP_LAST and tile.thread_rank() == 0) {
+        // Store the greatest index of all rows that are equal.
+        auto ref =
+          cuda::atomic_ref<size_type, cuda::thread_scope_device>{reduction_results[*out_ptr]};
+        ref.fetch_max(cg_idx, cuda::memory_order_relaxed);
+      }
+      if (keep == duplicate_keep_option::KEEP_NONE and tile.thread_rank() == 0) {
+        // Count the number of rows in each group of rows that are compared equal.
+        auto ref =
+          cuda::atomic_ref<size_type, cuda::thread_scope_device>{reduction_results[*out_ptr]};
+        ref.fetch_add(size_type{1}, cuda::memory_order_relaxed);
+      }
+    });
 
-  // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = reduce_by_row(map,
-                                               std::move(preprocessed_input),
-                                               input.num_rows(),
-                                               has_nulls,
-                                               has_nested_columns,
-                                               keep,
-                                               nulls_equal,
-                                               nans_equal,
-                                               stream,
-                                               rmm::mr::get_current_device_resource());
-
-  // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
     if (keep == duplicate_keep_option::KEEP_NONE) {
       // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
       // Thus, we only output index of the rows in the groups having group size of `1`.
       return thrust::copy_if(rmm::exec_policy(stream),
                              thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(input.num_rows()),
+                             thrust::make_counting_iterator(num_rows),
                              output_indices.begin(),
                              [reduction_results = reduction_results.begin()] __device__(
                                auto const idx) { return reduction_results[idx] == size_type{1}; });
@@ -136,7 +169,61 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
   }();
 
   output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
-  return output_indices;
+}
+
+/**
+ * @brief Computes the indices of the rows of `input` that remain after dropping duplicate rows
+ *
+ * Row indices are inserted as keys into a `cuco::static_set` whose equality and probing scheme are
+ * selected by `dispatch_hash_set`. For `KEEP_ANY` the keys stored in the set are returned directly;
+ * for every other `keep` option the result is produced by `distinct_first_last_none`.
+ *
+ * @param input Table whose rows are compared against each other
+ * @param keep Which of the rows that compare equal should be retained
+ * @param nulls_equal Forwarded to the row comparator to control whether nulls compare equal
+ * @param nans_equal Selects between the NaN-equal and the plain physical element comparator
+ * @param stream CUDA stream used for the allocations and the set operations issued here
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return Device vector of row indices; empty when `input` has no rows or no columns
+ */
+rmm::device_uvector<size_type> distinct_indices(table_view const& input,
+                                                duplicate_keep_option keep,
+                                                null_equality nulls_equal,
+                                                nan_equality nans_equal,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr)
+{
+  // Nothing to deduplicate: return an empty result without building any hash structures.
+  if (input.num_rows() == 0 or input.num_columns() == 0) {
+    return rmm::device_uvector<size_type>(0, stream, mr);
+  }
+
+  auto const preprocessed_input =
+    cudf::experimental::row::hash::preprocessed_table::create(input, stream);
+  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
+  auto const has_nested_columns = cudf::detail::has_nested_columns(input);
+
+  auto const row_hash = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const d_hash   = row_hash.device_hasher(has_nulls);
+
+  auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  // Invoked by `dispatch_hash_set` once the comparator and probing scheme types are known.
+  auto const helper_func = [&](auto const& d_equal, auto const& probing_scheme) {
+    // NOTE(review): the two `{}` arguments are default-constructed; presumably the thread scope
+    // and storage tags of `cuco::static_set` -- confirm against the cuco constructor signature.
+    auto set = cuco::static_set{input.num_rows(),
+                                0.5,  // desired load factor
+                                cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                                d_equal,
+                                probing_scheme,
+                                {},
+                                {},
+                                cudf::detail::cuco_allocator{stream},
+                                stream.value()};
+    auto const iter = thrust::counting_iterator<cudf::size_type>{0};
+    auto const size = set.insert(iter, iter + input.num_rows(), stream.value());
+
+    auto output_indices = rmm::device_uvector<size_type>(size, stream, mr);
+    // If we don't care about order, just gather indices of distinct keys taken from the set.
+    if (keep == duplicate_keep_option::KEEP_ANY) {
+      set.retrieve_all(output_indices.begin(), stream.value());
+      return output_indices;
+    }
+
+    // The helper fills `output_indices` and resizes it to the number of indices written.
+    distinct_first_last_none(
+      set.ref(cuco::op::insert_and_find), output_indices, input.num_rows(), keep, stream, mr);
+    return output_indices;
+  };
+
+  // Reuse the flag computed above rather than inspecting the column hierarchy a second time.
+  if (has_nested_columns) {
+    return dispatch_hash_set<true>(
+      nulls_equal, nans_equal, has_nulls, row_equal, d_hash, helper_func);
+  } else {
+    return dispatch_hash_set<false>(
+      nulls_equal, nans_equal, has_nulls, row_equal, d_hash, helper_func);
+  }
+}
 
 std::unique_ptr<table> distinct(table_view const& input,

@@ -15,16 +15,17 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
 #include <cudf/detail/stream_compaction.hpp>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,6 @@
 
 #include "distinct_helpers.hpp"
 
-#include <cudf/detail/hash_reduce_by_row.cuh>
-
 namespace cudf::detail {
 
 namespace {

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include "stream_compaction_common.hpp"
-
+#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,8 @@
  */
 #pragma once
 
-#include "stream_compaction_common.hpp"
-
 #include <cudf/stream_compaction.hpp>
+#include <cudf/utilities/bit.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>