Skip to content

Commit

Permalink
Merge pull request #1 from jserv/opencl-improve
Browse files Browse the repository at this point in the history
OpenCL improvements
  • Loading branch information
inkeliz authored Aug 1, 2020
2 parents c7f05fa + 83f717a commit cd443d0
Showing 1 changed file with 104 additions and 93 deletions.
197 changes: 104 additions & 93 deletions gpu_cl.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,53 +189,56 @@ typedef struct __blake2b_param
uchar salt[BLAKE2B_SALTBYTES]; // 48
uchar personal[BLAKE2B_PERSONALBYTES]; // 64
} blake2b_param;
// Optimization: the blake2 byte counter can overflow a single uint64_t only
// if messages larger than 17 exabytes are seen. Set the value to 2 if such
// messages should be supported; otherwise keep 1.
#define BLAKE2B_LONG_MESSAGE (1)
/*
 * Hashing state read and written by the blake2b_* helpers in this file.
 * NOTE(review): two declarations of `t` appear below. This looks like the
 * before/after lines of a diff rendered together -- confirm which one is
 * live before relying on the size of `t`.
 */
typedef struct __blake2b_state
{
ulong h[8]; /* blake2b_compress XORs v[i] and v[i + 8] into h[i] */
ulong t[2];
ulong t[BLAKE2B_LONG_MESSAGE]; /* advanced by blake2b_increment_counter */
ulong f[2]; /* f[0] set to all ones by blake2b_set_lastblock, f[1] by blake2b_set_lastnode */
uchar buf[2 * BLAKE2B_BLOCKBYTES];
size_t buflen; /* presumably the number of bytes currently held in buf -- TODO confirm */
uchar last_node; /* when non-zero, blake2b_set_lastblock also calls blake2b_set_lastnode */
} blake2b_state;
/*
 * Constant tables consumed by blake2b_compress: eight 64-bit IV words and
 * the blake2b_sigma table, whose entries are used as indices into the
 * message word array m[] by the G macro (12 rows of 16 entries).
 * NOTE(review): each table appears here in two forms back to back (plain
 * array initialiser vs. enum constants for the IV; 2-D rows vs. flat rows
 * for sigma). This looks like before/after diff lines rendered together --
 * confirm which form is live before editing.
 */
__constant static const ulong blake2b_IV[8] =
enum Blake2b_IV
{
0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL,
0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL,
0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL,
0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL
iv0 = 0x6a09e667f3bcc908UL,
iv1 = 0xbb67ae8584caa73bUL,
iv2 = 0x3c6ef372fe94f82bUL,
iv3 = 0xa54ff53a5f1d36f1UL,
iv4 = 0x510e527fade682d1UL,
iv5 = 0x9b05688c2b3e6c1fUL,
iv6 = 0x1f83d9abfb41bd6bUL,
iv7 = 0x5be0cd19137e2179UL,
};
__constant static const uchar blake2b_sigma[12][16] =
__constant const static ulong blake2b_IV[8] = {iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7};
__constant const static uchar blake2b_sigma[12 * 16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3,
11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4,
7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8,
9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13,
2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9,
12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11,
13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10,
6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5,
10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3,
};
/*
 * Sets every bit of the second finalization word, S->f[1].
 * Always returns 0.
 */
static inline int blake2b_set_lastnode( blake2b_state *S )
{
    const ulong all_ones = ~0UL;

    S->f[1] = all_ones;
    return 0;
}
/* Some helper functions, not necessarily useful */
/*
 * Sets every bit of the first finalization word, S->f[0]. When
 * S->last_node is non-zero, blake2b_set_lastnode() is invoked first so that
 * S->f[1] is filled as well. Always returns 0.
 */
static inline int blake2b_set_lastblock( blake2b_state *S )
{
    if( S->last_node != 0 )
    {
        blake2b_set_lastnode( S );
    }

    S->f[0] = ~0UL;
    return 0;
}
/*
 * Adds inc to the counter word S->t[0]. When BLAKE2B_LONG_MESSAGE is
 * greater than 1, a wrap-around of t[0] is carried into S->t[1].
 * Always returns 0.
 */
static inline int blake2b_increment_counter( blake2b_state *S, const ulong inc )
{
    S->t[0] = S->t[0] + inc;
#if BLAKE2B_LONG_MESSAGE > 1
    /* After an unsigned wrap the new t[0] is smaller than inc, so the
       comparison result supplies the carry without needing a branch. */
    S->t[1] = S->t[1] + ( S->t[0] < inc );
#endif
    return 0;
}
static inline ulong load64( const void *src )
Expand Down Expand Up @@ -283,16 +286,15 @@ static inline void store64( void *dst, ulong w )
*p++ = ( uchar )w;
#endif
}
/*
 * Rotates w right by c bit positions: the bits shifted out at the low end
 * are OR-ed back in at the high end.
 */
static inline ulong rotr64( const ulong w, const unsigned c )
{
    const ulong shifted_down = w >> c;
    const ulong wrapped_up = w << ( 64 - c );

    return shifted_down | wrapped_up;
}
/*
 * Byte-fill helper: treats dest_a as an array of uchar and walks `count`
 * iterations, storing through an advancing pointer.
 * NOTE(review): this span contains two signatures (ucharset with a fill
 * value, memzero without), two store statements, and a one-line rotr64
 * wedged between the signatures. It looks like before/after diff lines
 * rendered together -- confirm which lines are live.
 */
static void ucharset (void * dest_a, int val, size_t count)
/* presumably rotate() is the OpenCL left-rotate built-in, so rotating by
   (64 - shift) yields a right rotation by shift -- TODO confirm */
static inline ulong rotr64(ulong a, ulong shift) { return rotate(a, 64 - shift); }
static void memzero (void * dest_a, size_t count)
{
uchar * dest = (uchar *)dest_a;
for (size_t i = 0; i < count; ++i)
{
*dest++ = val;
*dest++ = 0;
}
}
/* init xors IV with input parameter block */
Expand All @@ -304,7 +306,7 @@ static inline int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
h = ( uchar * )( S->h );
p = ( uchar * )( P );
/* IV XOR ParamBlock */
ucharset( S, 0, sizeof( blake2b_state ) );
memzero( S, sizeof( blake2b_state ) );
for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
return 0;
}
Expand All @@ -320,68 +322,74 @@ static inline int blake2b_init( blake2b_state *S, const uchar outlen )
store64( &P->node_offset, 0 );
P->node_depth = 0;
P->inner_length = 0;
ucharset( P->reserved, 0, sizeof( P->reserved ) );
ucharset( P->salt, 0, sizeof( P->salt ) );
ucharset( P->personal, 0, sizeof( P->personal ) );
memzero( P->reserved, sizeof( P->reserved ) );
memzero( P->salt, sizeof( P->salt ) );
memzero( P->personal, sizeof( P->personal ) );
return blake2b_init_param( S, P );
}
/*
 * G(r, i, a, b, c, d): one mixing step over the four lvalues a, b, c, d.
 * Two message words are picked from m[] through the flat blake2b_sigma
 * table at offsets (r * 16 + i * 2) and (r * 16 + i * 2 + 1).
 * Expects `m`, `blake2b_sigma` and `rotr64` to be visible at the point of
 * expansion. r and i are parenthesized so expression arguments index the
 * table correctly.
 */
#define G(r,i,a,b,c,d) \
do { \
    (a) = (a) + (b) + m[blake2b_sigma[(r) * 16 + (i) * 2 + 0]]; \
    (d) = rotr64((d) ^ (a), 32); \
    (c) = (c) + (d); \
    (b) = rotr64((b) ^ (c), 24); \
    (a) = (a) + (b) + m[blake2b_sigma[(r) * 16 + (i) * 2 + 1]]; \
    (d) = rotr64((d) ^ (a), 16); \
    (c) = (c) + (d); \
    (b) = rotr64((b) ^ (c), 63); \
} while(0)
/*
 * ROUND(r): applies G eight times for round r to the 16-word work vector
 * `v` (which must be in scope): first to the four columns, then to the four
 * diagonals.
 */
#define ROUND(r) \
do { \
    G((r),0,v[ 0],v[ 4],v[ 8],v[12]); \
    G((r),1,v[ 1],v[ 5],v[ 9],v[13]); \
    G((r),2,v[ 2],v[ 6],v[10],v[14]); \
    G((r),3,v[ 3],v[ 7],v[11],v[15]); \
    G((r),4,v[ 0],v[ 5],v[10],v[15]); \
    G((r),5,v[ 1],v[ 6],v[11],v[12]); \
    G((r),6,v[ 2],v[ 7],v[ 8],v[13]); \
    G((r),7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
/*
 * BLAKE2B_ROUNDS(): runs rounds 0 through 11 in order. Wrapped in
 * do/while(0) so the expansion is a single statement and the caller's
 * trailing semicolon does not produce an extra empty statement.
 */
#define BLAKE2B_ROUNDS() \
do { \
    ROUND(0); ROUND(1); ROUND(2); ROUND(3); \
    ROUND(4); ROUND(5); ROUND(6); ROUND(7); \
    ROUND(8); ROUND(9); ROUND(10); ROUND(11); \
} while(0)
/*
 * Runs 12 mixing rounds over one input block and folds the result into
 * S->h. Steps visible below: 16 words are read from `block` with load64()
 * into m[]; v[0..7] are copied from S->h and v[8..15] are built from the IV
 * words combined with S->t and S->f; the rounds run; finally each S->h[i]
 * is XORed with v[i] and v[i + 8]. Always returns 0.
 * NOTE(review): many statements here appear twice in slightly different
 * forms (blake2b_IV[n] vs. ivN, function-local G/ROUND macros with explicit
 * ROUND calls vs. BLAKE2B_ROUNDS()). This looks like before/after diff
 * lines rendered together -- confirm which lines are live before changing
 * anything.
 */
static int blake2b_compress( blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] )
{
ulong m[16];
ulong v[16];
int i;
/* Load the block as 16 words of sizeof( m[i] ) bytes each. */
for( i = 0; i < 16; ++i )
m[i] = load64( block + i * sizeof( m[i] ) );
m[i] = load64( block + i * sizeof( m[i] ) );
#pragma unroll 8
for( i = 0; i < 8; ++i )
v[i] = S->h[i];
v[ 8] = blake2b_IV[0];
v[ 9] = blake2b_IV[1];
v[10] = blake2b_IV[2];
v[11] = blake2b_IV[3];
v[12] = S->t[0] ^ blake2b_IV[4];
v[13] = S->t[1] ^ blake2b_IV[5];
v[14] = S->f[0] ^ blake2b_IV[6];
v[15] = S->f[1] ^ blake2b_IV[7];
/* Function-local G/ROUND variants that index blake2b_sigma as a 2-D table. */
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2b_sigma[r][2*i+0]]; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b + m[blake2b_sigma[r][2*i+1]]; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
#define ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
v[i] = S->h[i];
v[ 8] = iv0;
v[ 9] = iv1;
v[10] = iv2;
v[11] = iv3;
v[12] = S->t[0] ^ iv4;
/* t[1] is only mixed in when the state is built with a second counter word. */
#if BLAKE2B_LONG_MESSAGE > 1
v[13] = S->t[1] ^ iv5;
#else
v[13] = iv5;
#endif
v[14] = S->f[0] ^ iv6;
v[15] = iv7; /* ^ S->f[1] removed: no last_node */
BLAKE2B_ROUNDS();
/* Fold both halves of the work vector back into the chaining words. */
#pragma unroll 8
for( i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
S->h[i] ^= v[i] ^ v[i + 8];
return 0;
}
#undef G
#undef ROUND
#undef BLAKE2B_ROUNDS
static void ucharcpy (uchar * dst, uchar const * src, size_t count)
{
for (size_t i = 0; i < count; ++i)
Expand Down Expand Up @@ -434,18 +442,21 @@ static int blake2b_final( blake2b_state *S, uchar *out, uchar outlen )
// if ( S->t[0] < inc )
// S->t[1] += 1;
// This seems to crash the OpenCL compiler; fortunately it only calculates the size, and we don't process anything bigger than 2^32.
blake2b_set_lastblock( S );
ucharset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
// blake2b_set_lastblock(S)
S->f[0] = ~0UL;
memzero( S->buf + S->buflen, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
blake2b_compress( S, S->buf );
#pragma unroll 8
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
ucharcpy( out, buffer, outlen );
return 0;
}
static void ucharcpyglb (uchar * dst, __global uchar const * src, size_t count)
{
for (size_t i = 0; i < count; ++i)
for (size_t i = 0; i < count; i++)
{
*dst = *src;
++dst;
Expand Down

0 comments on commit cd443d0

Please sign in to comment.