From b67c0a97d16ec3c9d0abf825ad9755013b24ebab Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 2 Dec 2024 16:04:59 -0800
Subject: [PATCH] Update MurmurHash3_x64_128 to use the cuco equivalent
 implementation (#17457)

This PR modifies MurmurHash3_x64_128 to utilize the cuco equivalent implementation, eliminating duplication.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/17457
---
 .../hashing/detail/murmurhash3_x64_128.cuh    | 163 +++---------------
 .../hashing/detail/murmurhash3_x86_32.cuh     |   2 +-
 cpp/include/cudf/hashing/detail/xxhash_64.cuh |   2 +-
 cpp/src/hash/murmurhash3_x64_128.cu           |  17 +-
 4 files changed, 35 insertions(+), 149 deletions(-)
diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
index 5e88b905023..31390aa3edf 100644
--- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
+++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
@@ -15,177 +15,63 @@
  */
 #pragma once
 
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/strings/string_view.cuh>
 
-#include <thrust/pair.h>
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/array>
+#include <cuda/std/cstddef>
 
 namespace cudf::hashing::detail {
 
-// MurmurHash3_x64_128 implementation from
-// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
 template <typename Key>
 struct MurmurHash3_x64_128 {
-  using result_type = thrust::pair<uint64_t, uint64_t>;
+  using result_type = cuda::std::array<uint64_t, 2>;
 
-  constexpr MurmurHash3_x64_128() = default;
-  constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {}
-
-  __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const
+  CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED)
+    : _impl{seed}
   {
-    // Read a 4-byte value from the data pointer as individual bytes for safe
-    // unaligned access (very likely for string types).
-    auto block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
   }
 
-  __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const
-  {
-    uint64_t result = getblock32(data, offset + 4);
-    result          = result << 32;
-    return result | getblock32(data, offset);
-  }
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
 
-  __device__ inline uint64_t fmix64(uint64_t k) const
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
   {
-    k ^= k >> 33;
-    k *= 0xff51afd7ed558ccdUL;
-    k ^= k >> 33;
-    k *= 0xc4ceb9fe1a85ec53UL;
-    k ^= k >> 33;
-    return k;
+    return this->_impl.compute_hash(bytes, size);
   }
 
-  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
-
+ private:
   template <typename T>
-  result_type __device__ inline compute(T const& key) const
-  {
-    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
-  }
-
-  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
-                                                        cudf::size_type len,
-                                                        cudf::size_type tail_offset,
-                                                        result_type h) const
-  {
-    // Process remaining bytes that do not fill a 8-byte chunk.
-    uint64_t k1     = 0;
-    uint64_t k2     = 0;
-    auto const tail = reinterpret_cast<uint8_t const*>(data) + tail_offset;
-    switch (len & (BLOCK_SIZE - 1)) {
-      case 15: k2 ^= static_cast<uint64_t>(tail[14]) << 48;
-      case 14: k2 ^= static_cast<uint64_t>(tail[13]) << 40;
-      case 13: k2 ^= static_cast<uint64_t>(tail[12]) << 32;
-      case 12: k2 ^= static_cast<uint64_t>(tail[11]) << 24;
-      case 11: k2 ^= static_cast<uint64_t>(tail[10]) << 16;
-      case 10: k2 ^= static_cast<uint64_t>(tail[9]) << 8;
-      case 9:
-        k2 ^= static_cast<uint64_t>(tail[8]) << 0;
-        k2 *= c2;
-        k2 = rotate_bits_left(k2, 33);
-        k2 *= c1;
-        h.second ^= k2;
-
-      case 8: k1 ^= static_cast<uint64_t>(tail[7]) << 56;
-      case 7: k1 ^= static_cast<uint64_t>(tail[6]) << 48;
-      case 6: k1 ^= static_cast<uint64_t>(tail[5]) << 40;
-      case 5: k1 ^= static_cast<uint64_t>(tail[4]) << 32;
-      case 4: k1 ^= static_cast<uint64_t>(tail[3]) << 24;
-      case 3: k1 ^= static_cast<uint64_t>(tail[2]) << 16;
-      case 2: k1 ^= static_cast<uint64_t>(tail[1]) << 8;
-      case 1:
-        k1 ^= static_cast<uint64_t>(tail[0]) << 0;
-        k1 *= c1;
-        k1 = rotate_bits_left(k1, 31);
-        k1 *= c2;
-        h.first ^= k1;
-    };
-    return h;
-  }
-
-  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
+  __device__ constexpr result_type compute(T const& key) const
   {
-    auto const nblocks = len / BLOCK_SIZE;
-    uint64_t h1        = m_seed;
-    uint64_t h2        = m_seed;
-
-    // Process all four-byte chunks.
-    for (cudf::size_type i = 0; i < nblocks; i++) {
-      uint64_t k1 = getblock64(data, (i * BLOCK_SIZE));                     // 1st 8 bytes
-      uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2));  // 2nd 8 bytes
-
-      k1 *= c1;
-      k1 = rotate_bits_left(k1, 31);
-      k1 *= c2;
-
-      h1 ^= k1;
-      h1 = rotate_bits_left(h1, 27);
-      h1 += h2;
-      h1 = h1 * 5 + 0x52dce729;
-
-      k2 *= c2;
-      k2 = rotate_bits_left(k2, 33);
-      k2 *= c1;
-
-      h2 ^= k2;
-      h2 = rotate_bits_left(h2, 31);
-      h2 += h1;
-      h2 = h2 * 5 + 0x38495ab5;
-    }
-
-    thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2});
-
-    // Finalize hash.
-    h1 ^= len;
-    h2 ^= len;
-
-    h1 += h2;
-    h2 += h1;
-
-    h1 = fmix64(h1);
-    h2 = fmix64(h2);
-
-    h1 += h2;
-    h2 += h1;
-
-    return {h1, h2};
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
   }
 
- private:
-  uint64_t m_seed{};
-  static constexpr uint32_t BLOCK_SIZE = 16;  // 2 x 64-bit = 16 bytes
-
-  static constexpr uint64_t c1 = 0x87c37b91114253d5UL;
-  static constexpr uint64_t c2 = 0x4cf5ad432745937fUL;
+  cuco::murmurhash3_x64_128<Key> _impl;
 };
 
 template <>
 MurmurHash3_x64_128<bool>::result_type __device__ inline MurmurHash3_x64_128<bool>::operator()(
   bool const& key) const
 {
-  return compute<uint8_t>(key);
+  return this->compute<uint8_t>(key);
 }
 
 template <>
 MurmurHash3_x64_128<float>::result_type __device__ inline MurmurHash3_x64_128<float>::operator()(
   float const& key) const
 {
-  return compute(normalize_nans(key));
+  return this->compute(normalize_nans(key));
 }
 
 template <>
 MurmurHash3_x64_128<double>::result_type __device__ inline MurmurHash3_x64_128<double>::operator()(
   double const& key) const
 {
-  return compute(normalize_nans(key));
+  return this->compute(normalize_nans(key));
 }
 
 template <>
@@ -193,9 +79,8 @@ MurmurHash3_x64_128<cudf::string_view>::result_type
   __device__ inline MurmurHash3_x64_128<cudf::string_view>::operator()(
     cudf::string_view const& key) const
 {
-  auto const data = reinterpret_cast<std::byte const*>(key.data());
-  auto const len  = key.size_bytes();
-  return compute_bytes(data, len);
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
 }
 
 template <>
@@ -203,7 +88,7 @@ MurmurHash3_x64_128<numeric::decimal32>::result_type
   __device__ inline MurmurHash3_x64_128<numeric::decimal32>::operator()(
     numeric::decimal32 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 template <>
@@ -211,7 +96,7 @@ MurmurHash3_x64_128<numeric::decimal64>::result_type
   __device__ inline MurmurHash3_x64_128<numeric::decimal64>::operator()(
     numeric::decimal64 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 template <>
@@ -219,7 +104,7 @@ MurmurHash3_x64_128<numeric::decimal128>::result_type
   __device__ inline MurmurHash3_x64_128<numeric::decimal128>::operator()(
     numeric::decimal128 const& key) const
 {
-  return compute(key.value());
+  return this->compute(key.value());
 }
 
 }  // namespace cudf::hashing::detail
diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
index 38a7d927b9c..e0c7ce840d7 100644
--- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
+++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
@@ -33,7 +33,7 @@ template <typename Key>
 struct MurmurHash3_x86_32 {
   using result_type = hash_value_type;
 
-  __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
+  CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED)
     : _impl{seed}
   {
   }
diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh
index 7d72349e340..d77d040b365 100644
--- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh
+++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh
@@ -31,7 +31,7 @@ template <typename Key>
 struct XXHash_64 {
   using result_type = std::uint64_t;
 
-  __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+  CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
 
   __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
 
diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu
index 090bd92af8c..43df7f325ac 100644
--- a/cpp/src/hash/murmurhash3_x64_128.cu
+++ b/cpp/src/hash/murmurhash3_x64_128.cu
@@ -24,6 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/array>
 #include <thrust/for_each.h>
 
 namespace cudf {
@@ -31,7 +32,7 @@ namespace hashing {
 namespace detail {
 namespace {
 
-using hash_value_type = thrust::pair<uint64_t, uint64_t>;
+using hash_value_type = cuda::std::array<uint64_t, 2>;
 
 /**
  * @brief Computes the hash value of a row in the given table.
@@ -58,7 +59,7 @@ class murmur_device_row_hasher {
    */
   __device__ void operator()(size_type row_index) const noexcept
   {
-    auto h = cudf::detail::accumulate(
+    auto const h = cudf::detail::accumulate(
       _input.begin(),
       _input.end(),
       hash_value_type{_seed, 0},
@@ -66,8 +67,8 @@ class murmur_device_row_hasher {
         return cudf::type_dispatcher(
           column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
       });
-    _output1[row_index] = h.first;
-    _output2[row_index] = h.second;
+    _output1[row_index] = h[0];
+    _output2[row_index] = h[1];
   }
 
   /**
@@ -78,13 +79,13 @@ class murmur_device_row_hasher {
     template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
     __device__ hash_value_type operator()(column_device_view const& col,
                                           size_type row_index,
-                                          Nullate const _check_nulls,
-                                          hash_value_type const _seed) const noexcept
+                                          Nullate const check_nulls,
+                                          hash_value_type const seed) const noexcept
     {
-      if (_check_nulls && col.is_null(row_index)) {
+      if (check_nulls && col.is_null(row_index)) {
         return {std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max()};
       }
-      auto const hasher = MurmurHash3_x64_128<T>{_seed.first};
+      auto const hasher = MurmurHash3_x64_128<T>{seed[0]};
       return hasher(col.element<T>(row_index));
     }