Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add MD5 to existing hashing functionality #5438

Merged
merged 20 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5438 Add MD5 hash support
- PR #5704 Initial `fixed_point` Column Support
- PR #5716 Add `double_type_dispatcher` to libcudf
- PR #5739 Add `nvtext::detokenize` API
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ add_library(cudf
src/stream_compaction/drop_duplicates.cu
src/datetime/datetime_ops.cu
src/hash/hashing.cu
src/hash/hash_constants.cu
src/partitioning/partitioning.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
Expand Down
14 changes: 13 additions & 1 deletion cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,9 +37,21 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

} // namespace detail
} // namespace cudf
238 changes: 237 additions & 1 deletion cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION.
* Copyright (c) 2017-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,10 +16,246 @@

#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <hash/hash_constants.hpp>

#include "cuda_runtime_api.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include "cudf/types.hpp"

using hash_value_type = uint32_t;

namespace cudf {
namespace detail {
/**
* Modified GPU implementation of
* https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/
* Copyright (c) 2015 Barry Clark
* Licensed under the MIT license.
* See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/
void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destination)
{
// Transform 0xABCD1234 => 0x0000ABCD00001234 => 0x0B0A0D0C02010403
uint64_t x = num;
x = ((x & 0xFFFF0000) << 16) | ((x & 0xFFFF));
x = ((x & 0xF0000000F) << 8) | ((x & 0xF0000000F0) >> 4) | ((x & 0xF0000000F00) << 16) |
((x & 0xF0000000F000) << 4);

// Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits
uint64_t offsets = (((x + 0x0606060606060606) >> 4) & 0x0101010101010101) * 0x27;

x |= 0x3030303030303030;
x += offsets;
thrust::copy_n(thrust::seq, reinterpret_cast<uint8_t*>(&x), 8, destination);
}

struct MD5Hash {
__device__ MD5Hash(md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants)
: d_hash_constants(hash_constants), d_shift_constants(shift_constants)
{
}

/**
* @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,
* updating the hash value so far. Does not zero out the buffer contents.
*/
void __device__ hash_step(md5_intermediate_data* hash_state) const
{
uint32_t A = hash_state->hash_value[0];
uint32_t B = hash_state->hash_value[1];
uint32_t C = hash_state->hash_value[2];
uint32_t D = hash_state->hash_value[3];

for (unsigned int j = 0; j < 64; j++) {
uint32_t F;
uint32_t g;
switch (j / 16) {
case 0:
F = (B & C) | ((~B) & D);
g = j;
break;
case 1:
F = (D & B) | ((~D) & C);
g = (5 * j + 1) % 16;
break;
case 2:
F = B ^ C ^ D;
g = (3 * j + 5) % 16;
break;
case 3:
F = C ^ (B | (~D));
g = (7 * j) % 16;
break;
}

uint32_t buffer_element_as_int;
std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4);
F = F + A + d_hash_constants[j] + buffer_element_as_int;
A = D;
D = C;
C = B;
B = B + __funnelshift_l(F, F, d_shift_constants[((j / 16) * 4) + (j % 4)]);
}

hash_state->hash_value[0] += A;
hash_state->hash_value[1] += B;
hash_state->hash_value[2] += C;
hash_state->hash_value[3] += D;

hash_state->buffer_length = 0;
}
/**
* @brief Core MD5 element processing function
*/
template <typename TKey>
void __device__ process(TKey const& key, md5_intermediate_data* hash_state) const
{
uint32_t const len = sizeof(TKey);
uint8_t const* data = reinterpret_cast<uint8_t const*>(&key);
hash_state->message_length += len;

// 64 bytes for the number of bytes processed in a given step
constexpr int md5_chunk_size = 64;
if (hash_state->buffer_length + len < md5_chunk_size) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
hash_state->buffer_length += len;
} else {
uint32_t copylen = md5_chunk_size - hash_state->buffer_length;

thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
hash_step(hash_state);

while (len > md5_chunk_size + copylen) {
thrust::copy_n(thrust::seq, data + copylen, md5_chunk_size, hash_state->buffer);
hash_step(hash_state);
copylen += md5_chunk_size;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const
{
auto const full_length = (static_cast<uint64_t>(hash_state->message_length)) << 3;
thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80);

// 64 bytes for the number of bytes processed in a given step
constexpr int md5_chunk_size = 64;
// 8 bytes for the total message length, appended to the end of the last chunk processed
constexpr int message_length_size = 8;
// 1 byte for the end of the message flag
constexpr int end_of_message_size = 1;
if (hash_state->buffer_length + message_length_size + end_of_message_size <= md5_chunk_size) {
thrust::fill_n(
thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(md5_chunk_size - message_length_size - end_of_message_size - hash_state->buffer_length),
0x00);
} else {
thrust::fill_n(thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(md5_chunk_size - hash_state->buffer_length),
0x00);
hash_step(hash_state);

thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00);
}

thrust::copy_n(thrust::seq,
reinterpret_cast<uint8_t const*>(&full_length),
message_length_size,
hash_state->buffer + md5_chunk_size - message_length_size);
hash_step(hash_state);

#pragma unroll
for (int i = 0; i < 4; ++i)
uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i));
}

template <typename T, typename std::enable_if_t<is_chrono<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
release_assert(false && "MD5 Unsupported chrono type column");
}

template <typename T, typename std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
release_assert(false && "MD5 Unsupported non-fixed-width type column");
}

template <typename T, typename std::enable_if_t<is_floating_point<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
T const& key = col.element<T>(row_index);
if (isnan(key)) {
T nan = std::numeric_limits<T>::quiet_NaN();
process(nan, hash_state);
} else if (key == T{0.0}) {
process(T{0.0}, hash_state);
} else {
process(key, hash_state);
}
}

template <typename T,
typename std::enable_if_t<is_fixed_width<T>() && !is_floating_point<T>() &&
!is_chrono<T>()>* = nullptr>
void CUDA_DEVICE_CALLABLE operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
process(col.element<T>(row_index), hash_state);
}

private:
md5_hash_constants_type const* d_hash_constants;
md5_shift_constants_type const* d_shift_constants;
};

template <>
void CUDA_DEVICE_CALLABLE MD5Hash::operator()<string_view>(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
string_view key = col.element<string_view>(row_index);
uint32_t const len = static_cast<uint32_t>(key.size_bytes());
uint8_t const* data = reinterpret_cast<uint8_t const*>(key.data());

hash_state->message_length += len;

if (hash_state->buffer_length + len < 64) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
hash_state->buffer_length += len;
} else {
uint32_t copylen = 64 - hash_state->buffer_length;
thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
hash_step(hash_state);

while (len > 64 + copylen) {
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
hash_step(hash_state);
copylen += 64;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

} // namespace detail
} // namespace cudf

// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
Expand Down
8 changes: 7 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,8 +35,14 @@ namespace cudf {
* @returns A column where each row is the hash of a column from the input
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

std::unique_ptr<column> murmur_hash3_32(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want the same function exposed two ways. Either the Python should be updated to call the hash with the HASH_MURMUR3 enum or get rid of the hash API and add a specific MD5 hash. I'd opt for the former.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mirrored the hash_id enum over to the python side and removed the murmur hash specific function.

table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace cudf
9 changes: 9 additions & 0 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,5 +279,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs.
*/
std::size_t size_of(data_type t);

/**
* @brief Identifies the hash function to be used
*/
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_MD5 ///< MD5 hash function
};

/** @} */
} // namespace cudf
Loading