Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] SHA-1 and SHA-2 hashes #6020

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,12 @@ std::size_t size_of(data_type t);
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_MD5 ///< MD5 hash function
HASH_MD5, ///< MD5 hash function
HASH_SHA1,
HASH_SHA224,
HASH_SHA256,
HASH_SHA384,
HASH_SHA512
};

/** @} */
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/hash/hash_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,67 @@ __device__ __constant__ md5_hash_constants_type md5_hash_constants[64] = {
0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
};

using sha256_word_type = uint32_t;

struct sha256_intermediate_data {
uint64_t message_length = 0;
uint32_t buffer_length = 0;
uint32_t hash_value[8];
uint8_t buffer[64];
};

__device__ __constant__ sha256_word_type sha256_hash_constants[64] = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm concerned about using up all of the available constant memory with all of these lookup tables. There's only 64KB available.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, what will the access pattern be across threads in the same warp? If it is random, rather than uniform, I think just __device__ const would be better.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to change the approach, was asked to use __device__ __constant__ in the md5 hashing. Should I swap back to the thread_safe_per_context_cache?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, Mark's reply didn't show up in my initial comment.

Access pattern for each thread is sequential, but there's nothing synchronizing each thread across the warp -- so I think the access within a warp would effectively be random.

0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
__device__ __constant__ sha256_word_type sha224_initial_hash = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};

__device__ __constant__ sha256_word_type sha256_initial_hash = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};

using sha512_word_type = uint64_t;

struct sha512_intermediate_data {
uint64_t message_length = 0;
uint32_t buffer_length = 0;
uint64_t hash_value[8];
uint8_t buffer[128];
};

__device__ __constant__ sha512_word_type sha512_hash_constants[80] = {
0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 0x3956c25bf348b538,
0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, 0x12835b0145706fbe,
0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 0x983e5152ee66dfab,
0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed,
0x53380d139d95b3df, 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, 0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373,
0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 0xca273eceea26619c,
0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, 0x0a637dc5a2c898a6,
0x113f9804bef90dae, 0x1b710b35131c471b, 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
};

__device__ __constant__ sha512_word_type sha384_initial_hash = {
0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4
};

__device__ __constant__ sha512_word_type sha512_initial_hash = {
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
};

} // namespace detail
} // namespace cudf
23 changes: 23 additions & 0 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,12 @@ std::unique_ptr<column> hash(table_view const& input,
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, mr, stream);
case (hash_id::HASH_MD5): return md5_hash(input, mr, stream);
case (hash_id::HASH_SHA1): return sha1(input, mr, stream);
case (hash_id::HASH_SHA224): return sha256_base(input, true, mr, stream);
case (hash_id::HASH_SHA256): return sha256_base(input, false, mr, stream);
case (hash_id::HASH_SHA384): return sha512_base(input, true, mr, stream);
case (hash_id::HASH_SHA512): return sha512_base(input, false, mr, stream);

default: return nullptr;
}
}
Expand Down Expand Up @@ -765,6 +771,23 @@ std::unique_ptr<column> murmur_hash3_32(table_view const& input,
return output;
}

std::unique_ptr<column> sha1(table_view const& input,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream) {
}

std::unique_ptr<column> sha256_base(table_view const& input,
bool truncate_output,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use a bool parameter. Use an enum class with a descriptive name so it's more obvious at the callsite what this controls.

rmm::mr::device_memory_resource* mr,
cudaStream_t stream) {
}

std::unique_ptr<column> sha512_base(table_view const& input,
bool truncate_output,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream) {
}

} // namespace detail

std::unique_ptr<column> hash(table_view const& input,
Expand Down