-
Notifications
You must be signed in to change notification settings - Fork 916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[REVIEW] Add MD5 to existing hashing functionality #5438
Changes from 5 commits
b626432
0d3845c
d32552d
a5ed47f
f4ad66e
fa8c5d1
535ff93
c1725ec
14d7672
fa93274
94ceeae
8ae065e
efd8fdb
ba9efae
f3daf4b
f41c8cc
614182b
6aaa7b8
3b68a9d
7379b23
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -1,5 +1,5 @@ | ||||||||||
/* | ||||||||||
* Copyright (c) 2017, NVIDIA CORPORATION. | ||||||||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. | ||||||||||
* | ||||||||||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||
* you may not use this file except in compliance with the License. | ||||||||||
|
@@ -16,10 +16,221 @@ | |||||||||
|
||||||||||
#pragma once | ||||||||||
|
||||||||||
#include <cstdint> | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
#include <cudf/strings/string_view.cuh> | ||||||||||
#include <hash/hash_constants.hpp> | ||||||||||
|
||||||||||
#include "cuda_runtime_api.h" | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
#include "cudf/types.hpp" | ||||||||||
#include "driver_types.h" | ||||||||||
#include "vector_types.h" | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
|
||||||||||
using hash_value_type = uint32_t; | ||||||||||
|
||||||||||
namespace cudf { | ||||||||||
namespace detail { | ||||||||||
|
||||||||||
/** | ||||||||||
* @brief Helper function, left rotate bit value the value n bits | ||||||||||
*/ | ||||||||||
CUDA_HOST_DEVICE_CALLABLE uint32_t left_rotate(uint32_t value, uint32_t shift) | ||||||||||
{ | ||||||||||
return (value << shift) | (value >> (32 - shift)); | ||||||||||
} | ||||||||||
|
||||||||||
/** | ||||||||||
* @brief Core MD5 algorith implementation. Processes a single 512-bit chunk, | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
run spell check on all comments once. |
||||||||||
* updating the hash value so far. Does not zero out the buffer contents. | ||||||||||
*/ | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) | ||||||||||
{ | ||||||||||
uint32_t A = hash_state->hash_value[0]; | ||||||||||
uint32_t B = hash_state->hash_value[1]; | ||||||||||
uint32_t C = hash_state->hash_value[2]; | ||||||||||
uint32_t D = hash_state->hash_value[3]; | ||||||||||
rwlee marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
|
||||||||||
uint32_t* buffer_ints = (uint32_t*)hash_state->buffer; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use reinterpret_cast There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Switched to a memcpy
|
||||||||||
|
||||||||||
for (unsigned int j = 0; j < 64; j++) { | ||||||||||
uint32_t F, g; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
switch (j / 16) { | ||||||||||
case 0: | ||||||||||
F = (B & C) | ((~B) & D); // D ^ (B & (C ^ D)) | ||||||||||
g = j; | ||||||||||
break; | ||||||||||
case 1: | ||||||||||
F = (D & B) | ((~D) & C); | ||||||||||
g = (5 * j + 1) % 16; | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
break; | ||||||||||
case 2: | ||||||||||
F = B ^ C ^ D; | ||||||||||
g = (3 * j + 5) % 16; | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
break; | ||||||||||
case 3: | ||||||||||
F = C ^ (B | (~D)); | ||||||||||
g = (7 * j) % 16; | ||||||||||
karthikeyann marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
break; | ||||||||||
} | ||||||||||
|
||||||||||
F = F + A + hash_constants[j] + buffer_ints[g]; | ||||||||||
|
||||||||||
A = D; | ||||||||||
D = C; | ||||||||||
C = B; | ||||||||||
B = B + left_rotate(F, shift_constants[((j / 16) * 4) + (j % 4)]); | ||||||||||
} | ||||||||||
|
||||||||||
hash_state->hash_value[0] += A; | ||||||||||
hash_state->hash_value[1] += B; | ||||||||||
hash_state->hash_value[2] += C; | ||||||||||
hash_state->hash_value[3] += D; | ||||||||||
|
||||||||||
hash_state->buffer_length = 0; | ||||||||||
} | ||||||||||
|
||||||||||
template <typename Key> | ||||||||||
struct MD5Hash { | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was largely copying the structure of the murmur hash function, I'll change the operator to a template. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but notice that cudf/cpp/include/cudf/detail/utilities/hash_functions.cuh Lines 32 to 33 in 855e735
But the
In your case, both are templates, which just complicates the specializations. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In order to fix the type dispatching error this was changed. The base operator is no longer templated https://github.com/rapidsai/cudf/pull/5438/files#diff-a6ce3f9a4f61a23dd6469473c7dbf15fR147 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed the template when I removed the md5_element_hasher |
||||||||||
using argument_type = Key; | ||||||||||
|
||||||||||
/** | ||||||||||
* @brief Core MD5 element processing function | ||||||||||
*/ | ||||||||||
template <typename TKey> | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE process(TKey const& key, | ||||||||||
const uint32_t len, | ||||||||||
md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) const | ||||||||||
rwlee marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
{ | ||||||||||
uint8_t* data = (uint8_t*)&key; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reinterpret_cast here too There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||||||
hash_state->message_length += len; | ||||||||||
|
||||||||||
if (hash_state->buffer_length + len < 64) { | ||||||||||
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); | ||||||||||
jrhemstad marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
hash_state->buffer_length += len; | ||||||||||
} else { | ||||||||||
uint32_t copylen = 64 - hash_state->buffer_length; | ||||||||||
|
||||||||||
thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
|
||||||||||
while (len > 64 + copylen) { | ||||||||||
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
copylen += 64; | ||||||||||
} | ||||||||||
|
||||||||||
thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); | ||||||||||
hash_state->buffer_length = len - copylen; | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
template <typename T, typename std::enable_if_t<is_fixed_width<T>()>* = nullptr> | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key, | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can these be just |
||||||||||
md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) const | ||||||||||
{ | ||||||||||
process(key, size_of(key), hash_state, hash_constants, shift_constants); | ||||||||||
} | ||||||||||
|
||||||||||
template <typename T, typename std::enable_if_t<!is_fixed_width<T>()>* = nullptr> | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE operator()(T const& key, | ||||||||||
md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) const | ||||||||||
{ | ||||||||||
CUDF_FAIL("Unsupported hash type"); | ||||||||||
} | ||||||||||
|
||||||||||
void CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key, | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this one empty? Might warrant a comment if I'm not missing something trivial. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did a bunch of testing today to understand and explain this issue better. This Ideally the default behavior should actually be a During my testing this afternoon, non-fixed width types never hit the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this isn't right. You're not hitting There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. EDIT: the advice does not stand, see the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest looking at cudf/cpp/include/cudf/table/row_operators.cuh Line 411 in 855e735
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Required a bit of a rework, the use of |
||||||||||
md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) const | ||||||||||
{ | ||||||||||
} | ||||||||||
}; | ||||||||||
|
||||||||||
/** | ||||||||||
* @brief Specialization of MD5Hash operator for strings. | ||||||||||
*/ | ||||||||||
template <> | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE | ||||||||||
MD5Hash<cudf::string_view>::operator()(cudf::string_view const& key, | ||||||||||
md5_intermediate_data* hash_state, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants) const | ||||||||||
{ | ||||||||||
const uint32_t len = (uint32_t)key.size_bytes(); | ||||||||||
const uint8_t* data = (const uint8_t*)key.data(); | ||||||||||
|
||||||||||
hash_state->message_length += len; | ||||||||||
|
||||||||||
if (hash_state->buffer_length + len < 64) { | ||||||||||
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); | ||||||||||
hash_state->buffer_length += len; | ||||||||||
} else { | ||||||||||
uint32_t copylen = 64 - hash_state->buffer_length; | ||||||||||
thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
|
||||||||||
while (len > 64 + copylen) { | ||||||||||
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
copylen += 64; | ||||||||||
} | ||||||||||
|
||||||||||
thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); | ||||||||||
hash_state->buffer_length = len - copylen; | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
/** | ||||||||||
* @brief Finalize MD5 hash including converstion to hex string. | ||||||||||
*/ | ||||||||||
void CUDA_HOST_DEVICE_CALLABLE finalize_md5_hash(md5_intermediate_data* hash_state, | ||||||||||
char* result_location, | ||||||||||
const md5_hash_constants_type* hash_constants, | ||||||||||
const md5_shift_constants_type* shift_constants, | ||||||||||
const hex_to_char_mapping_type* hex_char_map) | ||||||||||
{ | ||||||||||
uint64_t full_length = (uint64_t)hash_state->message_length; | ||||||||||
full_length = full_length << 3; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||||||
thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80); | ||||||||||
|
||||||||||
if (hash_state->buffer_length <= 55) { | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I got a bit lost here. Why 55? Might be good to give the value a name at least (comments are probably not needed if this is a known implementation). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the MD5 Hash the last value after the message is set to 1. Since we're only considering full byte input, that set bit requires a full byte + the full message length in bits requires 9 bytes. Would There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like for this to look something like this:
(I probably did not get the comment or the name right) Ideally 64 would also be named (I don't know what name that would be, but you probably do). Then, you can use these constants instead of of 64/55/56 in the code and it should become more readable (and less error-prone). |
||||||||||
thrust::fill_n(thrust::seq, | ||||||||||
hash_state->buffer + hash_state->buffer_length + 1, | ||||||||||
(55 - hash_state->buffer_length), | ||||||||||
0x00); | ||||||||||
} else { | ||||||||||
thrust::fill_n(thrust::seq, | ||||||||||
hash_state->buffer + hash_state->buffer_length + 1, | ||||||||||
(64 - hash_state->buffer_length), | ||||||||||
0x00); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
|
||||||||||
thrust::fill_n(thrust::seq, hash_state->buffer, 56, 0x00); | ||||||||||
} | ||||||||||
|
||||||||||
thrust::copy_n(thrust::seq, (uint8_t*)&full_length, 8, hash_state->buffer + 56); | ||||||||||
md5_hash_step(hash_state, hash_constants, shift_constants); | ||||||||||
|
||||||||||
u_char final_hash[32]; | ||||||||||
uint8_t* hash_result = (uint8_t*)hash_state->hash_value; | ||||||||||
for (int i = 0; i < 16; i++) { | ||||||||||
final_hash[i * 2] = hex_char_map[(hash_result[i] >> 4) & 0xf]; | ||||||||||
final_hash[i * 2 + 1] = hex_char_map[hash_result[i] & 0xf]; | ||||||||||
} | ||||||||||
|
||||||||||
thrust::copy_n(thrust::seq, final_hash, 32, result_location); | ||||||||||
} | ||||||||||
|
||||||||||
} // namespace detail | ||||||||||
} // namespace cudf | ||||||||||
|
||||||||||
// MurmurHash3_32 implementation from | ||||||||||
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp | ||||||||||
//----------------------------------------------------------------------------- | ||||||||||
|
@@ -250,4 +461,4 @@ struct IdentityHash { | |||||||||
}; | ||||||||||
|
||||||||||
template <typename Key> | ||||||||||
using default_hash = MurmurHash3_32<Key>; | ||||||||||
using default_hash = MurmurHash3_32<Key>; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing newline
Suggested change
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is this function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be the wrapper call for the IdentityHash function, I did not end up implementing it and will remove it.