From 83f717a3ef19f633f478c5a9c19f75bc28291837 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 31 Jul 2020 15:02:06 +0800 Subject: [PATCH] OpenCL improvements This patch attempts to tweak the OpenCL kernel in the following aspects: 1. Reduce unnecessary memory access; 2. Remove unreachable code; 3. Specialize the character-wise set routine (ucharset) into a zero-fill (memzero); 4. Add loop unrolling hints; 5. Assume messages do not exceed 17 exabytes and optimize accordingly. It is known to bring about a 15% speedup on NVIDIA TITAN Xp. --- gpu_cl.go | 197 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 104 insertions(+), 93 deletions(-) diff --git a/gpu_cl.go b/gpu_cl.go index 3c25a9a..d5905b8 100644 --- a/gpu_cl.go +++ b/gpu_cl.go @@ -189,53 +189,56 @@ typedef struct __blake2b_param uchar salt[BLAKE2B_SALTBYTES]; // 48 uchar personal[BLAKE2B_PERSONALBYTES]; // 64 } blake2b_param; + +// Optimize: blake2 can overflow a uint64_t only if messages larger than +// 17 exabytes are seen. Set the value to 2 if messages larger than 17 exabytes +// should be supported, otherwise keep 1. 
+#define BLAKE2B_LONG_MESSAGE (1) + typedef struct __blake2b_state { ulong h[8]; - ulong t[2]; + ulong t[BLAKE2B_LONG_MESSAGE]; ulong f[2]; uchar buf[2 * BLAKE2B_BLOCKBYTES]; size_t buflen; - uchar last_node; } blake2b_state; -__constant static const ulong blake2b_IV[8] = + +enum Blake2b_IV { - 0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, - 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL, - 0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, - 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL + iv0 = 0x6a09e667f3bcc908UL, + iv1 = 0xbb67ae8584caa73bUL, + iv2 = 0x3c6ef372fe94f82bUL, + iv3 = 0xa54ff53a5f1d36f1UL, + iv4 = 0x510e527fade682d1UL, + iv5 = 0x9b05688c2b3e6c1fUL, + iv6 = 0x1f83d9abfb41bd6bUL, + iv7 = 0x5be0cd19137e2179UL, }; -__constant static const uchar blake2b_sigma[12][16] = +__constant const static ulong blake2b_IV[8] = {iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7}; + +__constant const static uchar blake2b_sigma[12 * 16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, + 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8, + 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13, + 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9, + 12, 5, 
1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11, + 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, }; -static inline int blake2b_set_lastnode( blake2b_state *S ) -{ - S->f[1] = ~0UL; - return 0; -} -/* Some helper functions, not necessarily useful */ -static inline int blake2b_set_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_set_lastnode( S ); - S->f[0] = ~0UL; - return 0; -} + static inline int blake2b_increment_counter( blake2b_state *S, const ulong inc ) { S->t[0] += inc; +#if BLAKE2B_LONG_MESSAGE > 1 S->t[1] += ( S->t[0] < inc ); +#endif return 0; } static inline ulong load64( const void *src ) @@ -283,16 +286,15 @@ static inline void store64( void *dst, ulong w ) *p++ = ( uchar )w; #endif } -static inline ulong rotr64( const ulong w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 64 - c ) ); -} -static void ucharset (void * dest_a, int val, size_t count) + +static inline ulong rotr64(ulong a, ulong shift) { return rotate(a, 64 - shift); } + +static void memzero (void * dest_a, size_t count) { uchar * dest = (uchar *)dest_a; for (size_t i = 0; i < count; ++i) { - *dest++ = val; + *dest++ = 0; } } /* init xors IV with input parameter block */ @@ -304,7 +306,7 @@ static inline int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) h = ( uchar * )( S->h ); p = ( uchar * )( P ); /* IV XOR ParamBlock */ - ucharset( S, 0, sizeof( blake2b_state ) ); + memzero( S, sizeof( blake2b_state ) ); for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; return 0; } @@ -320,68 +322,74 @@ static inline int blake2b_init( blake2b_state *S, const uchar outlen ) store64( &P->node_offset, 0 ); P->node_depth = 0; P->inner_length = 0; - ucharset( P->reserved, 0, sizeof( P->reserved ) ); - ucharset( P->salt, 0, sizeof( 
P->salt ) ); - ucharset( P->personal, 0, sizeof( P->personal ) ); + memzero( P->reserved, sizeof( P->reserved ) ); + memzero( P->salt, sizeof( P->salt ) ); + memzero( P->personal, sizeof( P->personal ) ); return blake2b_init_param( S, P ); } + +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2b_sigma[r * 16 + i * 2 + 0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r * 16 + i * 2 + 1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) +#define BLAKE2B_ROUNDS() ROUND(0);ROUND(1);ROUND(2);ROUND(3);ROUND(4);ROUND(5);ROUND(6);ROUND(7);ROUND(8);ROUND(9);ROUND(10);ROUND(11); + static int blake2b_compress( blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] ) { ulong m[16]; ulong v[16]; int i; + for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); + m[i] = load64( block + i * sizeof( m[i] ) ); + + #pragma unroll 8 for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 
7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); + v[i] = S->h[i]; + + v[ 8] = iv0; + v[ 9] = iv1; + v[10] = iv2; + v[11] = iv3; + v[12] = S->t[0] ^ iv4; +#if BLAKE2B_LONG_MESSAGE > 1 + v[13] = S->t[1] ^ iv5; +#else + v[13] = iv5; +#endif + v[14] = S->f[0] ^ iv6; + v[15] = iv7; /* ^ S->f[1] removed: no last_node */ + + BLAKE2B_ROUNDS(); + + #pragma unroll 8 for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; -#undef G -#undef ROUND + S->h[i] ^= v[i] ^ v[i + 8]; + return 0; } +#undef G +#undef ROUND +#undef BLAKE2B_ROUNDS + static void ucharcpy (uchar * dst, uchar const * src, size_t count) { for (size_t i = 0; i < count; ++i) @@ -434,10 +442,13 @@ static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen ) // if ( S->t[0] < inc ) // S->t[1] += 1; // This seems to crash the opencl compiler though fortunately this is calculating size and we don't do things bigger than 2^32 - - blake2b_set_lastblock( S ); - ucharset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ + + // blake2b_set_lastblock(S) + S->f[0] = ~0UL; + memzero( S->buf + S->buflen, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ blake2b_compress( S, S->buf ); + + #pragma unroll 8 for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); ucharcpy( out, buffer, outlen ); @@ -445,7 +456,7 @@ static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen ) } static void ucharcpyglb (uchar * dst, __global uchar const * src, size_t count) { - for (size_t i = 0; i < count; ++i) + for (size_t i = 0; i < count; i++) { *dst = *src; ++dst;