Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add MD5 to existing hashing functionality #5438

Merged
merged 20 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5438 Add MD5 hash support

## Improvements

Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ add_library(cudf
src/stream_compaction/drop_duplicates.cu
src/datetime/datetime_ops.cu
src/hash/hashing.cu
src/hash/hash_constants.cu
src/partitioning/partitioning.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
Expand Down
19 changes: 18 additions & 1 deletion cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,9 +37,26 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

std::unique_ptr<column> identity_hash(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be the wrapper call for the IdentityHash function, I did not end up implementing it and will remove it.

table_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

/**
 * @brief Detail-level entry point for hashing a table with MurmurHash3_32.
 *
 * @param input Table view supplying the data to hash
 * @param initial_hash Optional vector of 32-bit values; defaults to empty
 *        (presumably per-column seed values -- TODO confirm against the implementation)
 * @param mr Device memory resource; defaults to rmm::mr::get_default_resource()
 *        (presumably used to allocate the returned column -- TODO confirm)
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @return Owning pointer to the resulting column
 *         (presumably one hash value per input row -- TODO confirm)
 */
std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

/**
 * @brief Detail-level entry point for hashing a table with MD5.
 *
 * Unlike murmur_hash3_32, this overload takes no initial hash values.
 *
 * @param input Table view supplying the data to hash
 * @param mr Device memory resource; defaults to rmm::mr::get_default_resource()
 *        (presumably used to allocate the returned column -- TODO confirm)
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @return Owning pointer to the resulting column
 *         (presumably a strings column holding one 32-character hex digest per
 *         input row, since finalize_md5_hash emits 32 hex characters -- TODO confirm)
 */
std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

} // namespace detail
} // namespace cudf
215 changes: 213 additions & 2 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION.
* Copyright (c) 2017-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,10 +16,221 @@

#pragma once

#include <cstdint>
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include <cudf/strings/string_view.cuh>
#include <hash/hash_constants.hpp>

#include "cuda_runtime_api.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include "cudf/types.hpp"
#include "driver_types.h"
#include "vector_types.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

using hash_value_type = uint32_t;

namespace cudf {
namespace detail {

/**
 * @brief Rotates a 32-bit value left by `shift` bit positions.
 *
 * Bits shifted out of the most significant end re-enter at the least
 * significant end. The shift amount is reduced modulo 32, so a shift of 0
 * (or any multiple of 32) returns `value` unchanged instead of evaluating an
 * out-of-range `value >> 32`, which is undefined behaviour.
 *
 * @param value The 32-bit word to rotate
 * @param shift Number of bit positions to rotate by
 * @return `value` rotated left by `shift % 32` bits
 */
CUDA_HOST_DEVICE_CALLABLE uint32_t left_rotate(uint32_t value, uint32_t shift)
{
  // For a 32-bit word the only well-defined shift amounts are 0..31.
  constexpr uint32_t shift_mask = 31;
  shift &= shift_mask;
  // (32 - shift) & 31 maps shift == 0 to a right shift of 0 rather than 32.
  return (value << shift) | (value >> ((32 - shift) & shift_mask));
}

/**
* @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* @brief Core MD5 algorith implementation. Processes a single 512-bit chunk,
* @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,

run spell check on all comments once.

* updating the hash value so far. Does not zero out the buffer contents.
*/
void CUDA_HOST_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants)
{
uint32_t A = hash_state->hash_value[0];
uint32_t B = hash_state->hash_value[1];
uint32_t C = hash_state->hash_value[2];
uint32_t D = hash_state->hash_value[3];
rwlee marked this conversation as resolved.
Show resolved Hide resolved

uint32_t* buffer_ints = (uint32_t*)hash_state->buffer;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use reinterpret_cast

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched to a memcpy

std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4);


for (unsigned int j = 0; j < 64; j++) {
uint32_t F, g;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uint32_t F, g;
uint32_t F;
uint32_t g;

switch (j / 16) {
case 0:
F = (B & C) | ((~B) & D); // D ^ (B & (C ^ D))
g = j;
break;
case 1:
F = (D & B) | ((~D) & C);
g = (5 * j + 1) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
case 2:
F = B ^ C ^ D;
g = (3 * j + 5) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
case 3:
F = C ^ (B | (~D));
g = (7 * j) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
}

F = F + A + hash_constants[j] + buffer_ints[g];

A = D;
D = C;
C = B;
B = B + left_rotate(F, shift_constants[((j / 16) * 4) + (j % 4)]);
}

hash_state->hash_value[0] += A;
hash_state->hash_value[1] += B;
hash_state->hash_value[2] += C;
hash_state->hash_value[3] += D;

hash_state->buffer_length = 0;
}

template <typename Key>
struct MD5Hash {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is MD5Hash templated on Key in addition to the operator()s being templates? I would think the struct does not need to be a template and instead just make the operator() be a template. That would simplify your specializations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was largely copying the structure of the murmur hash function, I'll change the operator to a template.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but notice that MurmurHash3_32 is a class template:

template <typename Key>
struct MurmurHash3_32 {

But the operator() is not:

result_type CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); }

In your case, both are templates, which just complicates the specializations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order to fix the type dispatching error this was changed. The base operator is no longer templated https://github.com/rapidsai/cudf/pull/5438/files#diff-a6ce3f9a4f61a23dd6469473c7dbf15fR147

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the template when I removed the md5_element_hasher

using argument_type = Key;

/**
* @brief Core MD5 element processing function
*/
template <typename TKey>
void CUDA_HOST_DEVICE_CALLABLE process(TKey const& key,
const uint32_t len,
md5_intermediate_data* hash_state,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants) const
rwlee marked this conversation as resolved.
Show resolved Hide resolved
{
uint8_t* data = (uint8_t*)&key;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reinterpret_cast here too

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uint8_t const* data = reinterpret_cast<uint8_t const*>(&key);
-- switched

hash_state->message_length += len;

if (hash_state->buffer_length + len < 64) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
hash_state->buffer_length += len;
} else {
uint32_t copylen = 64 - hash_state->buffer_length;

thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
md5_hash_step(hash_state, hash_constants, shift_constants);

while (len > 64 + copylen) {
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
md5_hash_step(hash_state, hash_constants, shift_constants);
copylen += 64;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

template <typename T, typename std::enable_if_t<is_fixed_width<T>()>* = nullptr>
void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these be just __device__ ? I don't see them being called from host.

md5_intermediate_data* hash_state,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants) const
{
process(key, size_of(key), hash_state, hash_constants, shift_constants);
}

template <typename T, typename std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key,
md5_intermediate_data* hash_state,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants) const
{
CUDF_FAIL("Unsupported hash type");
}

void CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this one empty? Might warrant a comment if I'm not missing something trivial.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did a bunch of testing today to understand and explain this issue better. This operator() is acting as a catch-all case representing default behavior, which allows me to override specific cases like a string_view outside of the struct. Without this un-templated operator function, I get a bunch of compile errors ../include/cudf/detail/utilities/hash_functions.cuh(165): error: no suitable constructor exists to convert from "const __nv_bool" to "cudf::data_type" for a bunch of different types.

Ideally the default behavior should actually be a CUDF_FAIL("Unsupported hash type") but adding that to the empty function causes ../include/cudf/detail/utilities/hash_functions.cuh(173): error: device code does not support exception handling errors.

During my testing this afternoon, non-fixed width types never hit the CUDF_FAIL on line 146 -- the same was seen for other types I was trying to filter out as unsupported column data types. It's clear the current method of filtering out unsupported types doesn't work, any guidance on how to fix this would be appreciated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this isn't right. You're not hitting CUDF_FAIL because you're doing device-side dispatch and calling your MD5Hash object in device code. CUDF_FAIL is a host-only construct. I'm surprised this even compiles.

Copy link
Contributor

@vuule vuule Jul 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EDIT: the advice does not stand, see the row_hasher instead as suggested below.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest looking at row_hasher here:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Required a bit of a rework, the use of size_of rather than sizeof was causing failures with primitive types and were being type dispatched to other operator functions.

md5_intermediate_data* hash_state,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants) const
{
}
};

/**
 * @brief Specialization of the MD5Hash call operator for strings.
 *
 * Appends the bytes of `key` to the running MD5 state: the byte count is added
 * to `hash_state->message_length`, bytes are staged in `hash_state->buffer`,
 * and md5_hash_step is invoked each time a full 64-byte (512-bit) chunk has
 * been assembled. Any bytes that do not complete a chunk stay in the buffer,
 * so on return `hash_state->buffer_length` is always less than 64.
 *
 * @param key String whose bytes are appended to the message being hashed
 * @param hash_state Running MD5 state (staging buffer, buffer length, message
 *        length, hash value); updated in place
 * @param hash_constants MD5 constants table forwarded to md5_hash_step
 * @param shift_constants MD5 shift table forwarded to md5_hash_step
 */
template <>
void CUDA_HOST_DEVICE_CALLABLE
MD5Hash<cudf::string_view>::operator()(cudf::string_view const& key,
                                       md5_intermediate_data* hash_state,
                                       const md5_hash_constants_type* hash_constants,
                                       const md5_shift_constants_type* shift_constants) const
{
  // Number of bytes consumed by one md5_hash_step call (one 512-bit chunk).
  constexpr uint32_t chunk_size = 64;

  auto const len  = static_cast<uint32_t>(key.size_bytes());
  auto const data = reinterpret_cast<uint8_t const*>(key.data());

  hash_state->message_length += len;

  if (hash_state->buffer_length + len < chunk_size) {
    // Not enough data to complete a chunk: just stage the bytes.
    thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
    hash_state->buffer_length += len;
  } else {
    // Top up the partially filled buffer and hash it; md5_hash_step resets
    // buffer_length to zero.
    uint32_t copylen = chunk_size - hash_state->buffer_length;
    thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
    md5_hash_step(hash_state, hash_constants, shift_constants);

    // Hash every remaining full chunk. The comparison is `>=` (not `>`) so
    // that a remainder of exactly one chunk is hashed here; otherwise
    // buffer_length would be left at 64 and finalize_md5_hash would place its
    // 0x80 terminator outside the 64 bytes that md5_hash_step reads.
    while (len >= chunk_size + copylen) {
      thrust::copy_n(thrust::seq, data + copylen, chunk_size, hash_state->buffer);
      md5_hash_step(hash_state, hash_constants, shift_constants);
      copylen += chunk_size;
    }

    // Stage the tail (fewer than chunk_size bytes, possibly none).
    thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
    hash_state->buffer_length = len - copylen;
  }
}

/**
* @brief Finalize MD5 hash including conversion to hex string.
*/
void CUDA_HOST_DEVICE_CALLABLE finalize_md5_hash(md5_intermediate_data* hash_state,
char* result_location,
const md5_hash_constants_type* hash_constants,
const md5_shift_constants_type* shift_constants,
const hex_to_char_mapping_type* hex_char_map)
{
uint64_t full_length = (uint64_t)hash_state->message_length;
full_length = full_length << 3;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uint64_t full_length = (uint64_t)hash_state->message_length;
full_length = full_length << 3;
auto const full_length = (static_cast<uint64_t>hash_state->message_length) << 3;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auto const full_length = (static_cast<uint64_t>(hash_state->message_length)) << 3;

thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80);

if (hash_state->buffer_length <= 55) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got a bit lost here. Why 55? Might be good to give the value a name at least (comments are probably not needed if this is a known implementation).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the MD5 Hash the last value after the message is set to 1. Since we're only considering full byte input, that set bit requires a full byte + the full message length in bits requires 9 bytes.

Would if (hash_state->buffer_length < 56) { be more intuitive?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like for this to look something like this:

// 8 bytes for the message length plus a byte for the special character
constexpr int metadata_length = 9;
if (hash_state->buffer_length + metadata_length <= 64) {

(I probably did not get the comment or the name right)

Ideally 64 would also be named (I don't know what name that would be, but you probably do). Then, you can use these constants instead of 64/55/56 in the code and it should become more readable (and less error-prone).

thrust::fill_n(thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(55 - hash_state->buffer_length),
0x00);
} else {
thrust::fill_n(thrust::seq,
hash_state->buffer + hash_state->buffer_length + 1,
(64 - hash_state->buffer_length),
0x00);
md5_hash_step(hash_state, hash_constants, shift_constants);

thrust::fill_n(thrust::seq, hash_state->buffer, 56, 0x00);
}

thrust::copy_n(thrust::seq, (uint8_t*)&full_length, 8, hash_state->buffer + 56);
md5_hash_step(hash_state, hash_constants, shift_constants);

u_char final_hash[32];
uint8_t* hash_result = (uint8_t*)hash_state->hash_value;
for (int i = 0; i < 16; i++) {
final_hash[i * 2] = hex_char_map[(hash_result[i] >> 4) & 0xf];
final_hash[i * 2 + 1] = hex_char_map[hash_result[i] & 0xf];
}

thrust::copy_n(thrust::seq, final_hash, 32, result_location);
}

} // namespace detail
} // namespace cudf

// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
Expand Down Expand Up @@ -250,4 +461,4 @@ struct IdentityHash {
};

template <typename Key>
using default_hash = MurmurHash3_32<Key>;
using default_hash = MurmurHash3_32<Key>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing newline

Suggested change
using default_hash = MurmurHash3_32<Key>;
using default_hash = MurmurHash3_32<Key>;

3 changes: 2 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,7 @@ namespace cudf {
* @returns A column where each row is the hash of a column from the input
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

Expand Down
9 changes: 9 additions & 0 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,5 +269,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs.
*/
std::size_t size_of(data_type t);

/**
 * @brief Selects which hash function a hashing operation should apply.
 */
enum class hash_id {
  HASH_IDENTITY = 0,  ///< Identity hash: simply returns the key to be hashed
  HASH_MURMUR3  = 1,  ///< Murmur3 hash function
  HASH_MD5      = 2   ///< MD5 hash function
};

/** @} */
} // namespace cudf
Loading