rapidsai · kkraus14 · Aug 13, 2020 · Jun 9, 2020 · Jul 7, 2020 · Jul 21, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -47,6 +47,7 @@
 - PR #5658 Add `filter_tokens` nvtext API
 - PR #5666 Add `filter_characters_of_type` strings API
 - PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
+- PR #5438 Add MD5 hash support
 
 ## Improvements
 

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -423,6 +423,7 @@ add_library(cudf
             src/stream_compaction/drop_duplicates.cu
             src/datetime/datetime_ops.cu
             src/hash/hashing.cu
+            src/hash/hash_constants.cu
             src/partitioning/partitioning.cu
             src/quantiles/quantile.cu
             src/quantiles/quantiles.cu

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,9 +37,26 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> hash(table_view const& input,
+                             hash_id hash_function                     = hash_id::HASH_MURMUR3,
                              std::vector<uint32_t> const& initial_hash = {},
                              rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
                              cudaStream_t stream                 = 0);
 
+std::unique_ptr<column> identity_hash(
+  table_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
+  cudaStream_t stream                 = 0);
+
+std::unique_ptr<column> murmur_hash3_32(
+  table_view const& input,
+  std::vector<uint32_t> const& initial_hash = {},
+  rmm::mr::device_memory_resource* mr       = rmm::mr::get_default_resource(),
+  cudaStream_t stream                       = 0);
+
+std::unique_ptr<column> md5_hash(
+  table_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
+  cudaStream_t stream                 = 0);
+
 }  // namespace detail
 }  // namespace cudf
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,221 @@
 
 #pragma once
 
+#include <cstdint>
+#include <cstring>
 #include <cudf/strings/string_view.cuh>
+#include <hash/hash_constants.hpp>
+
+#include "cuda_runtime_api.h"
+#include "cudf/types.hpp"
+#include "driver_types.h"
+#include "vector_types.h"
 
 using hash_value_type = uint32_t;
 
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Helper function that rotates a 32-bit value left by a number of bits.
+ *
+ * Both shift counts are reduced modulo 32, so a shift of 0 (or any multiple of
+ * 32) returns `value` unchanged instead of evaluating `value >> 32`, which is
+ * undefined behavior for a 32-bit operand.
+ *
+ * @param value The 32-bit value to rotate
+ * @param shift Number of bit positions to rotate left
+ * @return `value` rotated left by `shift % 32` bits
+ */
+CUDA_HOST_DEVICE_CALLABLE uint32_t left_rotate(uint32_t value, uint32_t shift)
+{
+  // Masking keeps each shift count in [0, 31].
+  return (value << (shift & 31u)) | (value >> ((32u - shift) & 31u));
+}
+
+/**
+ * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,
+ * updating the hash value so far. Does not zero out the buffer contents.
+ *
+ * Runs the 64 MD5 rounds over the 64 bytes currently held in
+ * `hash_state->buffer`, adds the result into `hash_state->hash_value` and
+ * resets `hash_state->buffer_length` to 0.
+ *
+ * @param hash_state Running MD5 state (hash value, chunk buffer, buffer length)
+ * @param hash_constants Per-round additive constants, indexed by round 0..63
+ * @param shift_constants Left-rotation amounts, four per group of 16 rounds
+ */
+void CUDA_HOST_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state,
+                                             const md5_hash_constants_type* hash_constants,
+                                             const md5_shift_constants_type* shift_constants)
+{
+  uint32_t A = hash_state->hash_value[0];
+  uint32_t B = hash_state->hash_value[1];
+  uint32_t C = hash_state->hash_value[2];
+  uint32_t D = hash_state->hash_value[3];
+
+  for (unsigned int j = 0; j < 64; j++) {
+    // j / 16 is always in [0, 3]; the initializers only silence
+    // maybe-uninitialized diagnostics for the switch below.
+    uint32_t F = 0;
+    uint32_t g = 0;
+    switch (j / 16) {
+      case 0:
+        F = (B & C) | ((~B) & D);  // equivalent to D ^ (B & (C ^ D))
+        g = j;
+        break;
+      case 1:
+        F = (D & B) | ((~D) & C);
+        g = (5 * j + 1) % 16;
+        break;
+      case 2:
+        F = B ^ C ^ D;
+        g = (3 * j + 5) % 16;
+        break;
+      case 3:
+        F = C ^ (B | (~D));
+        g = (7 * j) % 16;
+        break;
+    }
+
+    // Load 32-bit word g of the chunk with memcpy instead of dereferencing a
+    // uint32_t* cast of the byte buffer, which avoids aliasing and alignment
+    // assumptions about the buffer.
+    uint32_t buffer_element_as_int;
+    std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4);
+
+    F = F + A + hash_constants[j] + buffer_element_as_int;
+
+    A = D;
+    D = C;
+    C = B;
+    B = B + left_rotate(F, shift_constants[((j / 16) * 4) + (j % 4)]);
+  }
+
+  hash_state->hash_value[0] += A;
+  hash_state->hash_value[1] += B;
+  hash_state->hash_value[2] += C;
+  hash_state->hash_value[3] += D;
+
+  // The chunk has been consumed; the buffer bytes themselves are left as-is.
+  hash_state->buffer_length = 0;
+}
+
+template <typename Key>
+struct MD5Hash {
 template <typename Key> 
 struct MurmurHash3_32 { 
 result_type CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } 
 template <typename Key> 
 struct MurmurHash3_32 { 
 result_type CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } 
+  using argument_type = Key;
+
+  /**
+   * @brief Core MD5 element processing function
+   *
+   * Appends the `len` bytes of the object representation of `key` to the
+   * running MD5 state, hashing every complete 64-byte chunk as it fills.
+   * On return `hash_state->buffer_length` is always less than 64.
+   *
+   * @param key Value whose bytes are appended to the message
+   * @param len Number of bytes to read starting at the address of `key`
+   * @param hash_state Running MD5 state updated in place
+   * @param hash_constants Per-round additive constants forwarded to md5_hash_step
+   * @param shift_constants Per-round rotation amounts forwarded to md5_hash_step
+   */
+  template <typename TKey>
+  void CUDA_HOST_DEVICE_CALLABLE process(TKey const& key,
+                                         const uint32_t len,
+                                         md5_intermediate_data* hash_state,
+                                         const md5_hash_constants_type* hash_constants,
+                                         const md5_shift_constants_type* shift_constants) const
+  {
+    uint8_t const* data = reinterpret_cast<uint8_t const*>(&key);
+    hash_state->message_length += len;
+
+    if (hash_state->buffer_length + len < 64) {
+      // Not enough data for a full chunk yet: just stage the bytes.
+      thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
+      hash_state->buffer_length += len;
+    } else {
+      // Top up the partially filled buffer to a full chunk and hash it.
+      uint32_t copylen = 64 - hash_state->buffer_length;
+
+      thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
+      md5_hash_step(hash_state, hash_constants, shift_constants);
+
+      // Hash every remaining complete chunk. `>=` (rather than `>`) makes sure
+      // a trailing chunk of exactly 64 bytes is hashed here instead of being
+      // left in the buffer with buffer_length == 64, which finalization cannot
+      // pad correctly.
+      while (len >= 64 + copylen) {
+        thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
+        md5_hash_step(hash_state, hash_constants, shift_constants);
+        copylen += 64;
+      }
+
+      // Stage the leftover tail (fewer than 64 bytes) for the next call.
+      thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
+      hash_state->buffer_length = len - copylen;
+    }
+  }
+
+  /**
+   * @brief Hashes a fixed-width value by appending its bytes to the MD5 state.
+   *
+   * The byte count is `sizeof(T)`, matching the bytes `process` reads from the
+   * address of `key`.
+   */
+  template <typename T, typename std::enable_if_t<is_fixed_width<T>()>* = nullptr>
+  void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key,
+                                            md5_intermediate_data* hash_state,
+                                            const md5_hash_constants_type* hash_constants,
+                                            const md5_shift_constants_type* shift_constants) const
+  {
+    process(key, static_cast<uint32_t>(sizeof(T)), hash_state, hash_constants, shift_constants);
+  }
+
+  /**
+   * @brief Overload selected for types that are not fixed-width; always fails.
+   *
+   * NOTE(review): this function is host/device callable but reports the error
+   * through CUDF_FAIL - confirm that this is usable on the device path.
+   */
+  template <typename T, typename std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
+  void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key,
+                                            md5_intermediate_data* hash_state,
+                                            const md5_hash_constants_type* hash_constants,
+                                            const md5_shift_constants_type* shift_constants) const
+  {
+    CUDF_FAIL("Unsupported hash type");
+  }
+
+  /**
+   * @brief Non-template overload taking the struct's own `Key` type.
+   *
+   * The body is empty here; an explicit specialization for
+   * `cudf::string_view` is defined after the struct.
+   * NOTE(review): for every other `Key` this overload does nothing - confirm
+   * that callers reach the templated overloads above for those types.
+   */
+  void CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key,
+                                            md5_intermediate_data* hash_state,
+                                            const md5_hash_constants_type* hash_constants,
+                                            const md5_shift_constants_type* shift_constants) const
+  {
+  }
+};
+
+/**
+ * @brief Specialization of MD5Hash operator for strings.
+ *
+ * Appends the `size_bytes()` bytes at `key.data()` to the running MD5 state,
+ * hashing every complete 64-byte chunk as it fills. On return
+ * `hash_state->buffer_length` is always less than 64.
+ *
+ * @param key String whose bytes are appended to the message
+ * @param hash_state Running MD5 state updated in place
+ * @param hash_constants Per-round additive constants forwarded to md5_hash_step
+ * @param shift_constants Per-round rotation amounts forwarded to md5_hash_step
+ */
+template <>
+void CUDA_HOST_DEVICE_CALLABLE
+MD5Hash<cudf::string_view>::operator()(cudf::string_view const& key,
+                                       md5_intermediate_data* hash_state,
+                                       const md5_hash_constants_type* hash_constants,
+                                       const md5_shift_constants_type* shift_constants) const
+{
+  auto const len  = static_cast<uint32_t>(key.size_bytes());
+  auto const data = reinterpret_cast<uint8_t const*>(key.data());
+
+  hash_state->message_length += len;
+
+  if (hash_state->buffer_length + len < 64) {
+    // Not enough data for a full chunk yet: just stage the bytes.
+    thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
+    hash_state->buffer_length += len;
+  } else {
+    // Top up the partially filled buffer to a full chunk and hash it.
+    uint32_t copylen = 64 - hash_state->buffer_length;
+    thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
+    md5_hash_step(hash_state, hash_constants, shift_constants);
+
+    // Hash every remaining complete chunk. `>=` (rather than `>`) makes sure a
+    // trailing chunk of exactly 64 bytes is hashed here instead of being left
+    // in the buffer with buffer_length == 64, which finalization cannot pad
+    // correctly.
+    while (len >= 64 + copylen) {
+      thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
+      md5_hash_step(hash_state, hash_constants, shift_constants);
+      copylen += 64;
+    }
+
+    // Stage the leftover tail (fewer than 64 bytes) for the next call.
+    thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
+    hash_state->buffer_length = len - copylen;
+  }
+}
+
+/**
+ * @brief Finalize MD5 hash including conversion to hex string.
+ *
+ * Pads the buffered tail of the message (a 0x80 byte, zero bytes, then the
+ * 64-bit message length in bits at offset 56 of the last chunk), hashes the
+ * final chunk(s) and writes the 16-byte digest as 32 hexadecimal characters.
+ *
+ * @param hash_state Running MD5 state; modified by the final hashing steps
+ * @param result_location Destination for exactly 32 characters; no terminator
+ *                        is written
+ * @param hash_constants Per-round additive constants forwarded to md5_hash_step
+ * @param shift_constants Per-round rotation amounts forwarded to md5_hash_step
+ * @param hex_char_map Lookup table mapping a nibble (0..15) to its hex character
+ */
+void CUDA_HOST_DEVICE_CALLABLE finalize_md5_hash(md5_intermediate_data* hash_state,
+                                                 char* result_location,
+                                                 const md5_hash_constants_type* hash_constants,
+                                                 const md5_shift_constants_type* shift_constants,
+                                                 const hex_to_char_mapping_type* hex_char_map)
+{
+  // A completely full buffer has to be hashed first; otherwise the padding
+  // byte below would be written past the end of the 64-byte chunk.
+  if (hash_state->buffer_length >= 64) {
+    md5_hash_step(hash_state, hash_constants, shift_constants);
+  }
+
+  // The length appended to the message is expressed in bits.
+  auto const full_length = static_cast<uint64_t>(hash_state->message_length) << 3;
+
+  // Padding always starts with a single 0x80 byte right after the message.
+  thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80);
+
+  if (hash_state->buffer_length <= 55) {
+    // The length still fits in this chunk: zero bytes up to offset 55.
+    thrust::fill_n(thrust::seq,
+                   hash_state->buffer + hash_state->buffer_length + 1,
+                   (55 - hash_state->buffer_length),
+                   0x00);
+  } else {
+    // No room for the length: zero the rest of this chunk (through offset 63,
+    // i.e. 63 - buffer_length bytes after the 0x80 byte), hash it, and start
+    // an all-zero chunk that will carry the length.
+    thrust::fill_n(thrust::seq,
+                   hash_state->buffer + hash_state->buffer_length + 1,
+                   (63 - hash_state->buffer_length),
+                   0x00);
+    md5_hash_step(hash_state, hash_constants, shift_constants);
+
+    thrust::fill_n(thrust::seq, hash_state->buffer, 56, 0x00);
+  }
+
+  // Bytes 56..63 of the final chunk hold the message length in bits.
+  thrust::copy_n(
+    thrust::seq, reinterpret_cast<uint8_t const*>(&full_length), 8, hash_state->buffer + 56);
+  md5_hash_step(hash_state, hash_constants, shift_constants);
+
+  // Emit two hex characters per digest byte, high nibble first.
+  auto const hash_result = reinterpret_cast<uint8_t const*>(hash_state->hash_value);
+  for (int i = 0; i < 16; i++) {
+    result_location[i * 2]     = hex_char_map[(hash_result[i] >> 4) & 0xf];
+    result_location[i * 2 + 1] = hex_char_map[hash_result[i] & 0xf];
+  }
+}
+
+}  // namespace detail
+}  // namespace cudf
+
 // MurmurHash3_32 implementation from
 // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
 //-----------------------------------------------------------------------------
@@ -250,4 +461,4 @@ struct IdentityHash {
 };
 
 template <typename Key>
-using default_hash = MurmurHash3_32<Key>;
+using default_hash = MurmurHash3_32<Key>;
-using default_hash = MurmurHash3_32<Key>;
+using default_hash = MurmurHash3_32<Key>;
+
-using default_hash = MurmurHash3_32<Key>;
+using default_hash = MurmurHash3_32<Key>;
+
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ namespace cudf {
  * @returns A column where each row is the hash of a column from the input
  */
 std::unique_ptr<column> hash(table_view const& input,
+                             hash_id hash_function                     = hash_id::HASH_MURMUR3,
                              std::vector<uint32_t> const& initial_hash = {},
                              rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 

@@ -269,5 +269,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs.
  */
 std::size_t size_of(data_type t);
 
+/**
+ * @brief Selects which hash function is applied.
+ */
+enum class hash_id {
+  HASH_IDENTITY = 0,  ///< Identity hash: the key itself is returned as the hash
+  HASH_MURMUR3  = 1,  ///< Murmur3 hash function
+  HASH_MD5      = 2   ///< MD5 hash function
+};
+
+
 /** @} */
 }  // namespace cudf