Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add MD5 to existing hashing functionality #5438

Merged
merged 20 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5438 Add MD5 hash support
- PR #5704 Initial `fixed_point` Column Support
- PR #5716 Add `double_type_dispatcher` to libcudf
- PR #5739 Add `nvtext::detokenize` API
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ add_library(cudf
src/stream_compaction/drop_duplicates.cu
src/datetime/datetime_ops.cu
src/hash/hashing.cu
src/hash/hash_constants.cu
src/partitioning/partitioning.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
Expand Down
14 changes: 13 additions & 1 deletion cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,9 +37,21 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

} // namespace detail
} // namespace cudf
225 changes: 224 additions & 1 deletion cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION.
* Copyright (c) 2017-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,10 +16,233 @@

#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <hash/hash_constants.hpp>

#include "cuda_runtime_api.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include "cudf/types.hpp"

using hash_value_type = uint32_t;

namespace cudf {
namespace detail {
/**
* Modified GPU implementation of
* https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ that is lowercase only and
vuule marked this conversation as resolved.
Show resolved Hide resolved
* does not flip the endianness of the input.
*/
void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destination)
{
// Transform 0xABCD1234 => 0x0000ABCD00001234 => 0x0B0A0D0C02010403
uint64_t x = num;
x = ((x & 0xFFFF0000) << 16) | ((x & 0xFFFF));
x = ((x & 0xF0000000F) << 8) | ((x & 0xF0000000F0) >> 4) | ((x & 0xF0000000F00) << 16) |
((x & 0xF0000000F000) << 4);

// Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits
uint64_t offsets = (((x + 0x0606060606060606) >> 4) & 0x0101010101010101) * 0x27;

x |= 0x3030303030303030;
x += offsets;
thrust::copy_n(thrust::seq, reinterpret_cast<uint8_t*>(&x), 8, destination);
}

struct MD5Hash {
__device__ MD5Hash(md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants)
: d_hash_constants(hash_constants), d_shift_constants(shift_constants)
{
}

/**
* @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,
* updating the hash value so far. Does not zero out the buffer contents.
*/
void __device__ hash_step(md5_intermediate_data* hash_state) const
{
uint32_t A = hash_state->hash_value[0];
uint32_t B = hash_state->hash_value[1];
uint32_t C = hash_state->hash_value[2];
uint32_t D = hash_state->hash_value[3];

for (unsigned int j = 0; j < 64; j++) {
uint32_t F;
uint32_t g;
switch (j / 16) {
case 0:
F = (B & C) | ((~B) & D);
g = j;
break;
case 1:
F = (D & B) | ((~D) & C);
g = (5 * j + 1) % 16;
break;
case 2:
F = B ^ C ^ D;
g = (3 * j + 5) % 16;
break;
case 3:
F = C ^ (B | (~D));
g = (7 * j) % 16;
break;
}

uint32_t buffer_element_as_int;
std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4);
F = F + A + d_hash_constants[j] + buffer_element_as_int;
A = D;
D = C;
C = B;
B = B + __funnelshift_l(F, F, d_shift_constants[((j / 16) * 4) + (j % 4)]);
}

hash_state->hash_value[0] += A;
hash_state->hash_value[1] += B;
hash_state->hash_value[2] += C;
hash_state->hash_value[3] += D;

hash_state->buffer_length = 0;
}
/**
* @brief Core MD5 element processing function
*/
template <typename TKey>
void __device__ process(TKey const& key, md5_intermediate_data* hash_state) const
{
uint32_t const len = sizeof(TKey);
uint8_t const* data = reinterpret_cast<uint8_t const*>(&key);
hash_state->message_length += len;

if (hash_state->buffer_length + len < 64) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
hash_state->buffer_length += len;
} else {
uint32_t copylen = 64 - hash_state->buffer_length;

thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
hash_step(hash_state);

while (len > 64 + copylen) {
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
hash_step(hash_state);
copylen += 64;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const
{
auto const full_length = (static_cast<uint64_t>(hash_state->message_length)) << 3;
thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80);

if (hash_state->buffer_length < 56) {
vuule marked this conversation as resolved.
Show resolved Hide resolved
thrust::fill_n(thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(55 - hash_state->buffer_length),
0x00);
} else {
thrust::fill_n(thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(64 - hash_state->buffer_length),
0x00);
hash_step(hash_state);

thrust::fill_n(thrust::seq, hash_state->buffer, 56, 0x00);
}

thrust::copy_n(
thrust::seq, reinterpret_cast<uint8_t const*>(&full_length), 8, hash_state->buffer + 56);
hash_step(hash_state);

#pragma unroll
for (int i = 0; i < 4; ++i)
uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i));
}

template <typename T, typename std::enable_if_t<is_chrono<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
release_assert(false && "MD5 Unsupported chrono type column");
}

template <typename T, typename std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
release_assert(false && "MD5 Unsupported non-fixed-width type column");
}

template <typename T, typename std::enable_if_t<is_floating_point<T>()>* = nullptr>
void __device__ operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
T const& key = col.element<T>(row_index);
if (isnan(key)) {
T nan = std::numeric_limits<T>::quiet_NaN();
process(nan, hash_state);
} else if (key == T{0.0}) {
process(T{0.0}, hash_state);
} else {
process(key, hash_state);
}
}

template <typename T,
typename std::enable_if_t<is_fixed_width<T>() && !is_floating_point<T>() &&
!is_chrono<T>()>* = nullptr>
void CUDA_DEVICE_CALLABLE operator()(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
process(col.element<T>(row_index), hash_state);
}

private:
md5_hash_constants_type const* d_hash_constants;
md5_shift_constants_type const* d_shift_constants;
};

template <>
void CUDA_DEVICE_CALLABLE MD5Hash::operator()<string_view>(column_device_view col,
size_type row_index,
md5_intermediate_data* hash_state) const
{
string_view key = col.element<string_view>(row_index);
uint32_t const len = static_cast<uint32_t>(key.size_bytes());
uint8_t const* data = reinterpret_cast<uint8_t const*>(key.data());

hash_state->message_length += len;

if (hash_state->buffer_length + len < 64) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
hash_state->buffer_length += len;
} else {
uint32_t copylen = 64 - hash_state->buffer_length;
thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
hash_step(hash_state);

while (len > 64 + copylen) {
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
hash_step(hash_state);
copylen += 64;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

} // namespace detail
} // namespace cudf

// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,7 @@ namespace cudf {
* @returns A column where each row is the hash of a column from the input
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

Expand Down
9 changes: 9 additions & 0 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,5 +279,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs.
*/
std::size_t size_of(data_type t);

/**
* @brief Identifies the hash function to be used
*/
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_MD5 ///< MD5 hash function
};

/** @} */
} // namespace cudf
Loading