Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add MD5 to existing hashing functionality #5438

Merged
merged 20 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5438 Add MD5 hash support

## Improvements

Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ add_library(cudf
src/stream_compaction/drop_duplicates.cu
src/datetime/datetime_ops.cu
src/hash/hashing.cu
src/hash/hash_constants.cu
src/partitioning/partitioning.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
Expand Down
14 changes: 13 additions & 1 deletion cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,9 +37,21 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

/**
 * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table.
 *
 * @param input The table of columns to hash
 * @param initial_hash Optional vector of initial hash values for each column.
 *        If this vector is empty then each element will be hashed as-is.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 *
 * @returns A column of hash values derived from the input rows
 */
std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

/**
 * @brief Computes the MD5 hash value of each row in the given table.
 *
 * @param input The table of columns to hash
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 *
 * @returns A column of MD5 hash values derived from the input rows
 */
std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

} // namespace detail
} // namespace cudf
205 changes: 204 additions & 1 deletion cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION.
* Copyright (c) 2017-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,10 +16,213 @@

#pragma once

#include <cstdint>
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include <cudf/strings/string_view.cuh>
#include <hash/hash_constants.hpp>

#include "cuda_runtime_api.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include "cudf/types.hpp"
#include "driver_types.h"
#include "vector_types.h"
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

using hash_value_type = uint32_t;

namespace cudf {
namespace detail {

/**
* @brief Core MD5 algorith implementation. Processes a single 512-bit chunk,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* @brief Core MD5 algorith implementation. Processes a single 512-bit chunk,
* @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk,

run spell check on all comments once.

* updating the hash value so far. Does not zero out the buffer contents.
*/
void CUDA_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants)
{
uint32_t A = hash_state->hash_value[0];
uint32_t B = hash_state->hash_value[1];
uint32_t C = hash_state->hash_value[2];
uint32_t D = hash_state->hash_value[3];
rwlee marked this conversation as resolved.
Show resolved Hide resolved

for (unsigned int j = 0; j < 64; j++) {
uint32_t F;
uint32_t g;
switch (j / 16) {
case 0:
F = (B & C) | ((~B) & D);
g = j;
break;
case 1:
F = (D & B) | ((~D) & C);
g = (5 * j + 1) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
case 2:
F = B ^ C ^ D;
g = (3 * j + 5) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
case 3:
F = C ^ (B | (~D));
g = (7 * j) % 16;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
break;
}

uint32_t buffer_element_as_int;
thrust::copy_n(thrust::seq,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't suggest using thrust::copy here. Use the compiler built-in memcpy, otherwise it may not get optimized away.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

memcpy is slower.
https://stackoverflow.com/a/49037139/1550940 (tested on 3.0 CC device. Need to validate once)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jrhemstad
memcpy copies one byte at a time and uses loop.

        ld.global.u8    %rs1, [%rd10];
        add.s64         %rd11, %rd3, %rd12;
        st.global.u8    [%rd11], %rs1;

thrust::copy_n copies the type size at a time.
For example, for int it uses unrolled loop of copying 32-bit at a time.

   ld.global.u32   %r3, [%rd4+8];
   st.global.u32   [%rd3+8], %r3;
   ld.global.u32   %r4, [%rd4+12];

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No actual copies should be performed. The compiler will optimize them away. This is a standard trick for type-punning without breaking aliasing rules: https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's optimizing away if these are registers!
(for global memory, it doesn't, as expected).

Copy link
Contributor

@jrhemstad jrhemstad Jul 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the compiler needs to see the declarations in order to elide the copies.

It is not expected that the same optimization doesn't work for global memory. This is purely a pessimization made by nvcc where if it can't see the declaration, it assumes the pointer is underaligned and performs 1B copies. Notice how gcc makes the same optimization without needing to go through temporaries: https://godbolt.org/z/f6Kra8

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing this to memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4); the compilation being killed

[22/76] Building CUDA object CMakeFiles/cudf.dir/src/hash/hashing.cu.o
FAILED: /usr/local/cuda-10.1/bin/nvcc  -Dcudf_EXPORTS -Igoogletest/install/include -I_deps/cub-src -I_deps/thrust-src -I_deps/jitify-src -I_deps/libcudacxx-src/include -I/usr/local/cuda-10.1/targets/x86_64-linux/include -Iinclude -Iinclude/jit -I../include -I../src -I/home/ryanlee/miniconda2/envs/cudf_dev_test/include -Xcompiler -Wno-parentheses -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 --expt-extended-lambda --expt-relaxed-constexpr -Werror=cross-execution-space-call -Xcompiler -Wall,-Werror,-Wno-error=deprecated-declarations -Xcompiler -Wno-deprecated-declarations -DBOOST_NO_CXX14_CONSTEXPR -DHT_DEFAULT_ALLOCATOR -O3 -DNDEBUG -Xcompiler=-fPIC   -DJITIFY_USE_CACHE -DCUDF_VERSION=0.15.0 -std=c++14 -x cu -c ../src/hash/hashing.cu -o CMakeFiles/cudf.dir/src/hash/hashing.cu.o && /usr/local/cuda-10.1/bin/nvcc  -Dcudf_EXPORTS -Igoogletest/install/include -I_deps/cub-src -I_deps/thrust-src -I_deps/jitify-src -I_deps/libcudacxx-src/include -I/usr/local/cuda-10.1/targets/x86_64-linux/include -Iinclude -Iinclude/jit -I../include -I../src -I/home/ryanlee/miniconda2/envs/cudf_dev_test/include -Xcompiler -Wno-parentheses -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 --expt-extended-lambda --expt-relaxed-constexpr -Werror=cross-execution-space-call -Xcompiler -Wall,-Werror,-Wno-error=deprecated-declarations -Xcompiler -Wno-deprecated-declarations -DBOOST_NO_CXX14_CONSTEXPR -DHT_DEFAULT_ALLOCATOR -O3 -DNDEBUG -Xcompiler=-fPIC   -DJITIFY_USE_CACHE -DCUDF_VERSION=0.15.0 -std=c++14 -x cu -M ../src/hash/hashing.cu -MT CMakeFiles/cudf.dir/src/hash/hashing.cu.o -o CMakeFiles/cudf.dir/src/hash/hashing.cu.o.d
Killed

Maybe i'm missing something here, but that's why I had gone back to thrust::copy_n despite your original suggestion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:/ That will require further investigation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After removing the md5_element_hasher class, the replacing the copy_n worked fine.

hash_state->buffer + g * 4,
4,
reinterpret_cast<uint8_t*>(&buffer_element_as_int));
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
F = F + A + hash_constants[j] + buffer_element_as_int;
A = D;
D = C;
C = B;

uint32_t shift = shift_constants[((j / 16) * 4) + (j % 4)];
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
B = B + ((F << shift) | (F >> (32 - shift)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use funnel_shift here.
Check other bit intrinsic functions too.
https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I substituted __funnelshift_l in -- replacing lines 78-79 with B = B + __funnelshift_l(F, F, shift_constants[((j / 16) * 4) + (j % 4)]); but I'm a little hesitant to commit and push the changes because it more than tripled my build time. Any idea why this would cause such a massive jump?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides, build time. Does it improve performance?
Is it built only for your GPU architecture or all GPU architectures?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was building for multiple architectures, but I don't think I was comparing apples to apples because I was using ninja to build cudf. In a few tests adding and reverting changes, the compile times seem nearly identical.

}

hash_state->hash_value[0] += A;
hash_state->hash_value[1] += B;
hash_state->hash_value[2] += C;
hash_state->hash_value[3] += D;

hash_state->buffer_length = 0;
}

template <typename Key>
struct MD5Hash {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is MD5Hash templated on Key in addition to the operator()s being templates? I would think the struct does not need to be a template and instead just make the operator() be a template. That would simplify your specializations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was largely copying the structure of the murmur hash function, I'll change the operator to a template.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but notice that MurmurHas3_32 is a class template:

template <typename Key>
struct MurmurHash3_32 {

But the operator() is not:

result_type CUDA_HOST_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); }

In your case, both are templates, which just complicates the specializations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order to fix the type dispatching error this was changed. The base operator is no longer templated https://github.com/rapidsai/cudf/pull/5438/files#diff-a6ce3f9a4f61a23dd6469473c7dbf15fR147

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the template when I removed the md5_element_hasher

using argument_type = Key;

/**
* @brief Core MD5 element processing function
*/
template <typename TKey>
void __device__ process(TKey const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
uint32_t const len = sizeof(TKey);
uint8_t const* data = reinterpret_cast<uint8_t const*>(&key);
hash_state->message_length += len;

if (hash_state->buffer_length + len < 64) {
thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length);
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
hash_state->buffer_length += len;
} else {
uint32_t copylen = 64 - hash_state->buffer_length;

thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length);
md5_hash_step(hash_state, hash_constants, shift_constants);

while (len > 64 + copylen) {
thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer);
md5_hash_step(hash_state, hash_constants, shift_constants);
copylen += 64;
}

thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer);
hash_state->buffer_length = len - copylen;
}
}

template <typename T, typename std::enable_if_t<is_chrono<T>()>* = nullptr>
void __device__ operator()(T const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
release_assert(false && "MD5 Unsupported chrono type column");
}

template <typename T,
typename std::enable_if_t<!std::is_same<T, cudf::string_view>::value &&
!is_fixed_width<T>()>* = nullptr>
void __device__ operator()(T const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of repeating hash_constants and shift_constants as function parameters, it might be nicer to make these data members of your MD5Hash class that are set at construction.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They could be made __constant__.

{
release_assert(false && "MD5 Unsupported non-fixed-width type column");
}

void CUDA_DEVICE_CALLABLE operator()(argument_type const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
process(key, hash_state, hash_constants, shift_constants);
}

template <typename T, typename std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
void __device__ process_floating_point(T const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
if (isnan(key)) {
T nan = std::numeric_limits<T>::quiet_NaN();
process(nan, hash_state, hash_constants, shift_constants);
} else if (key == T{0.0}) {
process(T{0.0}, hash_state, hash_constants, shift_constants);
} else {
process(key, hash_state, hash_constants, shift_constants);
}
}
};

/**
 * @brief Specialization for string columns: hashes the string's character
 * bytes rather than the object representation of the view itself.
 */
template <>
void CUDA_DEVICE_CALLABLE
MD5Hash<string_view>::operator()(string_view const& key,
                                 md5_intermediate_data* hash_state,
                                 md5_hash_constants_type const* hash_constants,
                                 md5_shift_constants_type const* shift_constants) const
{
  uint32_t const nbytes    = static_cast<uint32_t>(key.size_bytes());
  uint8_t const* byte_data = reinterpret_cast<uint8_t const*>(key.data());

  hash_state->message_length += nbytes;

  if (hash_state->buffer_length + nbytes < 64) {
    // Everything fits in the partially-filled chunk buffer; just append.
    thrust::copy_n(thrust::seq, byte_data, nbytes, hash_state->buffer + hash_state->buffer_length);
    hash_state->buffer_length += nbytes;
  } else {
    // Top off the current chunk and run the compression step on it.
    uint32_t consumed = 64 - hash_state->buffer_length;
    thrust::copy_n(thrust::seq, byte_data, consumed, hash_state->buffer + hash_state->buffer_length);
    md5_hash_step(hash_state, hash_constants, shift_constants);

    // Process any further whole 64-byte chunks directly from the string data.
    while (nbytes > 64 + consumed) {
      thrust::copy_n(thrust::seq, byte_data + consumed, 64, hash_state->buffer);
      md5_hash_step(hash_state, hash_constants, shift_constants);
      consumed += 64;
    }

    // Stash the tail bytes for the next element or for finalization.
    thrust::copy_n(thrust::seq, byte_data + consumed, nbytes - consumed, hash_state->buffer);
    hash_state->buffer_length = nbytes - consumed;
  }
}

// Specialization for float: route through process_floating_point so that all
// NaN payloads and signed zeros hash to a single canonical value (see
// process_floating_point, which normalizes NaN -> quiet_NaN and -0.0 -> +0.0).
template <>
void CUDA_DEVICE_CALLABLE
MD5Hash<float>::operator()(float const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
this->process_floating_point(key, hash_state, hash_constants, shift_constants);
}

// Specialization for double: same canonicalization rationale as the float
// specialization above.
template <>
void CUDA_DEVICE_CALLABLE
MD5Hash<double>::operator()(double const& key,
md5_intermediate_data* hash_state,
md5_hash_constants_type const* hash_constants,
md5_shift_constants_type const* shift_constants) const
{
this->process_floating_point(key, hash_state, hash_constants, shift_constants);
}

} // namespace detail
} // namespace cudf

// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,7 @@ namespace cudf {
 * @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

Expand Down
9 changes: 9 additions & 0 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,5 +269,14 @@ inline bool operator==(data_type const& lhs, data_type const& rhs) { return lhs.
*/
std::size_t size_of(data_type t);

/**
 * @brief Identifies the hash function to be used by the hashing APIs
 * (e.g. `cudf::hash`).
 */
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_MD5 ///< MD5 hash function
};

/** @} */
} // namespace cudf
Loading