Skip to content

Commit

Permalink
Add XXHash_64 hash function to cudf (#13612)
Browse files Browse the repository at this point in the history
Add XXHash_64 hash function to libcudf

```
std::unique_ptr<column> xxhash_64(
  table_view const& input,  uint64_t seed,
  rmm::cuda_stream_view stream,  rmm::mr::device_memory_resource* mr);
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Daniel Jünger (https://github.com/sleeepyjack)

URL: #13612
  • Loading branch information
davidwendt authored Jul 19, 2023
1 parent 84578d7 commit 541c5bf
Show file tree
Hide file tree
Showing 6 changed files with 549 additions and 6 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ add_library(
src/hash/murmurhash3_x86_32.cu
src/hash/murmurhash3_x64_128.cu
src/hash/spark_murmurhash3_x86_32.cu
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/from_arrow.cu
src/interop/to_arrow.cu
Expand Down
27 changes: 23 additions & 4 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,11 @@ std::unique_ptr<column> hash(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

//! Hash APIs
namespace hashing {

/**
* @brief Computes the MurmurHash3 32-bit of each row in the given table
* @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
*
* This function computes the hash of each column using the `seed` for the first column
* and the resulting hash as a seed for the next column and so on.
Expand All @@ -93,7 +94,7 @@ std::unique_ptr<column> murmurhash3_x86_32(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the hash values of each row in the input set of columns
* @brief Computes the MurmurHash3 64-bit hash value of each row in the given table
*
* This function takes a 64-bit seed value and returns hash values using the
* MurmurHash3_x64_128 algorithm. The hash produces two uint64 values per row.
Expand All @@ -112,7 +113,7 @@ std::unique_ptr<table> murmurhash3_x64_128(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the MurmurHash3 32-bit of each row in the given table
* @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
*
* This function computes the hash similar to MurmurHash3_x86_32 with special processing
* to match Spark's implementation results.
Expand All @@ -131,7 +132,7 @@ std::unique_ptr<column> spark_murmurhash3_x86_32(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the MD5 hash of each row in the given table
* @brief Computes the MD5 hash value of each row in the given table
*
* @param input The table of columns to hash
* @param stream CUDA stream used for device memory operations and kernel launches
Expand All @@ -144,6 +145,24 @@ std::unique_ptr<column> md5(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the XXHash_64 hash value of each row in the given table
*
* This function takes a 64-bit seed value and returns a column of type UINT64.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function;
*             `DEFAULT_HASH_SEED` is used when omitted
* @param stream CUDA stream used for device memory operations and kernel launches;
*               `cudf::get_default_stream()` is used when omitted
* @param mr Device memory resource used to allocate the returned column's device memory;
*           `rmm::mr::get_current_device_resource()` is used when omitted
*
* @returns A UINT64 column where each row is the hash of a row from the input
*/
std::unique_ptr<column> xxhash_64(
table_view const& input,
uint64_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace hashing

/** @} */ // end of group
Expand Down
5 changes: 5 additions & 0 deletions cpp/include/cudf/hashing/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ std::unique_ptr<column> md5(table_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
 * @brief Declaration of `xxhash_64` that takes every argument explicitly
 *
 * Unlike the `xxhash_64` declaration in cudf/hashing.hpp, no default arguments
 * are supplied here; callers must pass `seed`, `stream`, and `mr` themselves.
 * NOTE(review): presumably this is the entry point the public XXHash_64 API
 * forwards to -- confirm against src/hash/xxhash_64.cu.
 *
 * @param input The table of columns to hash
 * @param seed 64-bit seed value for the hash function
 * @param stream CUDA stream argument (named to match the sibling `md5` declaration)
 * @param mr Device memory resource argument
 *
 * @returns A column owned by the caller
 */
std::unique_ptr<column> xxhash_64(table_view const& input,
                                  uint64_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr);

/* Copyright 2005-2014 Daniel James.
*
* Use, modification and distribution is subject to the Boost Software
Expand Down
Loading

0 comments on commit 541c5bf

Please sign in to comment.