rapidsai · rapids-bot · Nov 22, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 21, 2024
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
+
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/strings/string_view.cuh>
 

@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/strings/string_view.cuh>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_64 : public cuco::xxhash_64<Key> {
+  using result_type = typename cuco::xxhash_64<Key>::result_type;
+
+  __device__ result_type operator()(Key const& key) const
+  {
+    return cuco::xxhash_64<Key>::operator()(key);
+  }
+
+  template <typename Extent>
+  __device__ result_type compute_hash(cuda::std::byte const* bytes, Extent size) const
+  {
+    return cuco::xxhash_64<Key>::compute_hash(bytes, size);
+  }
+};
+
+template <>
+XXHash_64<bool>::result_type __device__ inline XXHash_64<bool>::operator()(bool const& key) const
+{
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(key));
+}
+
+template <>
+XXHash_64<float>::result_type __device__ inline XXHash_64<float>::operator()(float const& key) const
+{
+  return cuco::xxhash_64<float>::operator()(normalize_nans(key));
+}
+
+template <>
+XXHash_64<double>::result_type __device__ inline XXHash_64<double>::operator()(
+  double const& key) const
+{
+  return cuco::xxhash_64<double>::operator()(normalize_nans(key));
+}
+
+template <>
+XXHash_64<cudf::string_view>::result_type
+  __device__ inline XXHash_64<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(key.data()), key.size_bytes());
+}
+
+template <>
+XXHash_64<numeric::decimal32>::result_type
+  __device__ inline XXHash_64<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+template <>
+XXHash_64<numeric::decimal64>::result_type
+  __device__ inline XXHash_64<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+template <>
+XXHash_64<numeric::decimal128>::result_type
+  __device__ inline XXHash_64<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+}  // namespace cudf::hashing::detail
@@ -18,6 +18,7 @@
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_64.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
@@ -35,207 +36,6 @@ namespace {
 
 using hash_value_type = uint64_t;
 
-template <typename Key>
-struct XXHash_64 {
-  using result_type = hash_value_type;
-
-  constexpr XXHash_64() = default;
-  constexpr XXHash_64(hash_value_type seed) : m_seed(seed) {}
-
-  __device__ inline uint32_t getblock32(std::byte const* data, std::size_t offset) const
-  {
-    // Read a 4-byte value from the data pointer as individual bytes for safe
-    // unaligned access (very likely for string types).
-    auto block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
-  }
-
-  __device__ inline uint64_t getblock64(std::byte const* data, std::size_t offset) const
-  {
-    uint64_t result = getblock32(data, offset + 4);
-    result          = result << 32;
-    return result | getblock32(data, offset);
-  }
-
-  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
-
-  template <typename T>
-  result_type __device__ inline compute(T const& key) const
-  {
-    auto data = device_span<std::byte const>(reinterpret_cast<std::byte const*>(&key), sizeof(T));
-    return compute_bytes(data);
-  }
-
-  result_type __device__ inline compute_remaining_bytes(device_span<std::byte const>& in,
-                                                        std::size_t offset,
-                                                        result_type h64) const
-  {
-    // remaining data can be processed in 8-byte chunks
-    if ((in.size() % 32) >= 8) {
-      for (; offset <= in.size() - 8; offset += 8) {
-        uint64_t k1 = getblock64(in.data(), offset) * prime2;
-
-        k1 = rotate_bits_left(k1, 31) * prime1;
-        h64 ^= k1;
-        h64 = rotate_bits_left(h64, 27) * prime1 + prime4;
-      }
-    }
-
-    // remaining data can be processed in 4-byte chunks
-    if ((in.size() % 8) >= 4) {
-      for (; offset <= in.size() - 4; offset += 4) {
-        h64 ^= (getblock32(in.data(), offset) & 0xfffffffful) * prime1;
-        h64 = rotate_bits_left(h64, 23) * prime2 + prime3;
-      }
-    }
-
-    // and the rest
-    if (in.size() % 4) {
-      while (offset < in.size()) {
-        h64 ^= (std::to_integer<uint8_t>(in[offset]) & 0xff) * prime5;
-        h64 = rotate_bits_left(h64, 11) * prime1;
-        ++offset;
-      }
-    }
-    return h64;
-  }
-
-  result_type __device__ compute_bytes(device_span<std::byte const>& in) const
-  {
-    uint64_t offset = 0;
-    uint64_t h64;
-    // data can be processed in 32-byte chunks
-    if (in.size() >= 32) {
-      auto limit  = in.size() - 32;
-      uint64_t v1 = m_seed + prime1 + prime2;
-      uint64_t v2 = m_seed + prime2;
-      uint64_t v3 = m_seed;
-      uint64_t v4 = m_seed - prime1;
-
-      do {
-        // pipeline 4*8byte computations
-        v1 += getblock64(in.data(), offset) * prime2;
-        v1 = rotate_bits_left(v1, 31);
-        v1 *= prime1;
-        offset += 8;
-        v2 += getblock64(in.data(), offset) * prime2;
-        v2 = rotate_bits_left(v2, 31);
-        v2 *= prime1;
-        offset += 8;
-        v3 += getblock64(in.data(), offset) * prime2;
-        v3 = rotate_bits_left(v3, 31);
-        v3 *= prime1;
-        offset += 8;
-        v4 += getblock64(in.data(), offset) * prime2;
-        v4 = rotate_bits_left(v4, 31);
-        v4 *= prime1;
-        offset += 8;
-      } while (offset <= limit);
-
-      h64 = rotate_bits_left(v1, 1) + rotate_bits_left(v2, 7) + rotate_bits_left(v3, 12) +
-            rotate_bits_left(v4, 18);
-
-      v1 *= prime2;
-      v1 = rotate_bits_left(v1, 31);
-      v1 *= prime1;
-      h64 ^= v1;
-      h64 = h64 * prime1 + prime4;
-
-      v2 *= prime2;
-      v2 = rotate_bits_left(v2, 31);
-      v2 *= prime1;
-      h64 ^= v2;
-      h64 = h64 * prime1 + prime4;
-
-      v3 *= prime2;
-      v3 = rotate_bits_left(v3, 31);
-      v3 *= prime1;
-      h64 ^= v3;
-      h64 = h64 * prime1 + prime4;
-
-      v4 *= prime2;
-      v4 = rotate_bits_left(v4, 31);
-      v4 *= prime1;
-      h64 ^= v4;
-      h64 = h64 * prime1 + prime4;
-    } else {
-      h64 = m_seed + prime5;
-    }
-
-    h64 += in.size();
-
-    h64 = compute_remaining_bytes(in, offset, h64);
-
-    return finalize(h64);
-  }
-
-  constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept
-  {
-    h ^= h >> 33;
-    h *= prime2;
-    h ^= h >> 29;
-    h *= prime3;
-    h ^= h >> 32;
-    return h;
-  }
-
- private:
-  hash_value_type m_seed{};
-  static constexpr uint64_t prime1 = 0x9e3779b185ebca87ul;
-  static constexpr uint64_t prime2 = 0xc2b2ae3d27d4eb4ful;
-  static constexpr uint64_t prime3 = 0x165667b19e3779f9ul;
-  static constexpr uint64_t prime4 = 0x85ebca77c2b2ae63ul;
-  static constexpr uint64_t prime5 = 0x27d4eb2f165667c5ul;
-};
-
-template <>
-hash_value_type __device__ inline XXHash_64<bool>::operator()(bool const& key) const
-{
-  return compute(static_cast<uint8_t>(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<float>::operator()(float const& key) const
-{
-  return compute(normalize_nans(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<double>::operator()(double const& key) const
-{
-  return compute(normalize_nans(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<cudf::string_view>::operator()(
-  cudf::string_view const& key) const
-{
-  auto const len = key.size_bytes();
-  auto data = device_span<std::byte const>(reinterpret_cast<std::byte const*>(key.data()), len);
-  return compute_bytes(data);
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal32>::operator()(
-  numeric::decimal32 const& key) const
-{
-  return compute(key.value());
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal64>::operator()(
-  numeric::decimal64 const& key) const
-{
-  return compute(key.value());
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal128>::operator()(
-  numeric::decimal128 const& key) const
-{
-  return compute(key.value());
-}
-
 /**
  * @brief Computes the hash value of a row in the given table.
  *