From f8f050cbe9938175aef9b18353f7a9866d081cd2 Mon Sep 17 00:00:00 2001 From: John Doering Date: Sat, 19 Mar 2016 23:36:25 +0200 Subject: [PATCH] v2.4.3: Performance and Compatibility Improvements --- cpu-miner.c | 145 +++++--- miner.h | 6 +- neoscrypt.c | 872 +++++++++++++++++++++++------------------------- neoscrypt.h | 33 +- neoscrypt_asm.S | 402 +++++++++++----------- scrypt.c | 6 +- sha2.c | 9 +- version.h | 2 +- 8 files changed, 745 insertions(+), 730 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 225638390..6d3a8ff69 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1,7 +1,7 @@ /* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler - * Copyright 2014-2015 John Doering + * Copyright 2014-2016 John Doering * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -21,9 +21,8 @@ #include #include #ifdef WIN32 +#include #include -typedef unsigned long ulong; -typedef unsigned int uint; #else #include #include @@ -182,20 +181,24 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ - neoscrypt NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 (default)\n\ - altscrypt Scrypt(1024, 1, 1) with Salsa20/8 through NeoScrypt\n\ + neoscrypt NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 (default)\n" +#ifdef SHA256 +"\ + altscrypt Scrypt(1024, 1, 1) with Salsa20/8 through NeoScrypt\n" +#endif +"\ scrypt Scrypt(1024, 1, 1) with Salsa20/8\n\ sha256d SHA-256d\n" -#if (ASM) +#ifdef ASM "\ -e, --engine=N choose a NeoScrypt hashing engine\n\ 0 integer (default)\n\ 1 SSE2\n" -#if (MINER_4WAY) +#ifdef MINER_4WAY "\ 2 SSE2 4-way\n" -#endif -#endif +#endif /* ASM */ +#endif /* MINER_4WAY */ "\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -1143,7 +1146,7 @@ bool fulltest_le(const uint *hash, const uint *target) { } static int scanhash_neoscrypt(int thr_id, uint *pdata, const uint *ptarget, - uint max_nonce, ulong *hashes_done, uint profile) { + uint max_nonce, uint *hashes_done, uint profile) { uint hash[8]; const uint targint = ptarget[7]; uint start_nonce = pdata[19], inc_nonce = 1; @@ -1169,8 +1172,9 @@ static int scanhash_neoscrypt(int thr_id, uint *pdata, const uint *ptarget, return(0); } +#ifdef SHA256 static int scanhash_altscrypt(int thr_id, uint *pdata, const uint *ptarget, - uint max_nonce, ulong *hashes_done, uint profile) { + uint max_nonce, uint *hashes_done, uint profile) { uint hash[8], data[20]; const uint targint = ptarget[7]; uint start_nonce = pdata[19], inc_nonce = 1; @@ -1202,11 +1206,11 @@ static int scanhash_altscrypt(int thr_id, uint *pdata, const uint *ptarget, *hashes_done = data[19] - inc_nonce - start_nonce; return(0); } +#endif /* SHA256 */ -#if (ASM) && (MINER_4WAY) - +#if defined(ASM) && defined(MINER_4WAY) static int scanhash_neoscrypt_4way(int thr_id, uint *pdata, - const uint *ptarget, uint max_nonce, ulong *hashes_done) { + const uint *ptarget, uint max_nonce, uint *hashes_done, uchar *scratchpad) { uint hash[32]; const uint targint = ptarget[7]; uint start_nonce = pdata[19]; @@ -1215,7 +1219,7 @@ static int scanhash_neoscrypt_4way(int thr_id, uint *pdata, while((pdata[19] <= (max_nonce - inc_nonce)) && !work_restart[thr_id].restart) { - neoscrypt_4way((uint8_t *) pdata, (uint8_t *) hash, 0); + neoscrypt_4way((uchar *) pdata, (uchar *) hash, scratchpad); if(hash[7] <= targint) { if(fulltest_le(&hash[0], ptarget)) { @@ -1256,8 +1260,9 @@ static int scanhash_neoscrypt_4way(int thr_id, uint *pdata, return(0); } +#ifdef SHA256 static int scanhash_altscrypt_4way(int thr_id, uint *pdata, - const uint *ptarget, uint max_nonce, ulong *hashes_done) { + const uint *ptarget, uint max_nonce, uint *hashes_done, uchar *scratchpad) { uint hash[32], data[20]; const uint targint = ptarget[7]; uint start_nonce = pdata[19]; @@ -1272,7 +1277,7 @@ static int scanhash_altscrypt_4way(int thr_id, uint *pdata, while((data[19] <= (max_nonce - inc_nonce)) && !work_restart[thr_id].restart) { - neoscrypt_4way((uint8_t *) data, (uint8_t *) hash, 1); + scrypt_4way((uchar *) data, (uchar *) hash, scratchpad); if(hash[7] <= targint) { if(fulltest_le(&hash[0], ptarget)) { @@ -1316,9 +1321,8 @@ static int scanhash_altscrypt_4way(int thr_id, uint *pdata, *hashes_done = data[19] - inc_nonce - start_nonce; return(0); } - - -#endif +#endif /* SHA256 */ +#endif /* (ASM) && (MINER_4WAY) */ static void *miner_thread(void *userdata) { @@ -1327,7 +1331,6 @@ static void *miner_thread(void *userdata) struct work work = {{0}}; uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - unsigned char *scratchbuf = NULL; char s[16]; int i; @@ -1347,14 +1350,29 @@ static void *miner_thread(void *userdata) thr_id, thr_id % num_processors); affine_to_cpu(thr_id, thr_id % num_processors); } - - if (opt_algo == ALGO_SCRYPT) - { - scratchbuf = scrypt_buffer_alloc(); - } - while (1) { - unsigned long hashes_done; + + uchar *scratchbuf = NULL; +#if defined(ASM) && defined(MINER_4WAY) + const size_t align = 0x40; + if(opt_neoscrypt_asm == 2) { + if(opt_algo == ALGO_NEOSCRYPT) { + scratchbuf = (uchar *) malloc(134464 + align); + } +#ifdef SHA256 + else if(opt_algo == ALGO_ALTSCRYPT) { + scratchbuf = (uchar *) malloc(525632 + align); + } +#endif /* SHA256 */ + } else +#endif /* (ASM) && (MINER_4WAY) */ + if(opt_algo == ALGO_SCRYPT) { + scratchbuf = scrypt_buffer_alloc(); + } + + + while(1) { + uint hashes_done; struct timeval tv_start, tv_end, diff; int64_t max64; int rc; @@ -1414,7 +1432,9 @@ static void *miner_thread(void *userdata) switch(opt_algo) { case(ALGO_NEOSCRYPT): +#ifdef SHA256 case(ALGO_ALTSCRYPT): +#endif case(ALGO_SCRYPT): max64 = 0x3FFFF; if(opt_nfactor > 3) @@ -1441,26 +1461,30 @@ static void *miner_thread(void *userdata) switch(opt_algo) { case(ALGO_NEOSCRYPT): -#if (ASM) && (MINER_4WAY) +#if defined(ASM) && defined(MINER_4WAY) if(opt_neoscrypt_asm == 2) rc = scanhash_neoscrypt_4way(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done, + (uchar *) &scratchbuf[(size_t)scratchbuf & (align - 1)]); else #endif rc = scanhash_neoscrypt(thr_id, work.data, work.target, max_nonce, &hashes_done, opt_neoscrypt_profile); break; +#ifdef SHA256 case(ALGO_ALTSCRYPT): -#if (ASM) && (MINER_4WAY) +#if defined(ASM) && defined(MINER_4WAY) if(opt_neoscrypt_asm == 2) rc = scanhash_altscrypt_4way(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done, + (uchar *) &scratchbuf[(size_t)scratchbuf & (align - 1)]); else -#endif +#endif /* (ASM) && (MINER_4WAY) */ rc = scanhash_altscrypt(thr_id, work.data, work.target, max_nonce, &hashes_done, opt_neoscrypt_profile); break; +#endif /* SHA256 */ case ALGO_SCRYPT: rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target, @@ -1482,15 +1506,14 @@ static void *miner_thread(void *userdata) timeval_subtract(&diff, &tv_end, &tv_start); if (diff.tv_usec || diff.tv_sec) { pthread_mutex_lock(&stats_lock); - thr_hashrates[thr_id] = - hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + thr_hashrates[thr_id] = + (ullong)hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); pthread_mutex_unlock(&stats_lock); } if (!opt_quiet) { sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.3f", 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "thread %d: %lu hashes, %s KH/s", - thr_id, hashes_done, s); + applog(LOG_INFO, "thread %d: %u hashes, %s KH/s", thr_id, hashes_done, s); } if (opt_benchmark && thr_id == opt_n_threads - 1) { double hashrate = 0.; @@ -1510,6 +1533,8 @@ static void *miner_thread(void *userdata) out: tq_freeze(mythr->q); + if(scratchbuf) free(scratchbuf); + return NULL; } @@ -1794,7 +1819,11 @@ static void parse_arg(int key, char *arg, char *pname) break; } } +#ifdef SHA256 if((i == ALGO_ALTSCRYPT) || (i == ALGO_SCRYPT)) +#else + if(i == ALGO_SCRYPT) +#endif opt_nfactor = 9; if (i == ARRAY_SIZE(algo_names)) { fprintf(stderr, "%s: unknown algorithm -- '%s'\n", @@ -1804,26 +1833,34 @@ static void parse_arg(int key, char *arg, char *pname) break; case('e'): -#if (ASM) +#ifdef ASM +#ifdef SHA256 if((opt_algo == ALGO_NEOSCRYPT) || (opt_algo == ALGO_ALTSCRYPT)) { +#else + if(opt_algo == ALGO_NEOSCRYPT) { +#endif /* SHA256 */ v = atoi(arg); -#if (MINER_4WAY) +#ifdef MINER_4WAY if((v < 0) || (v > 2)) { #else if((v < 0) || (v > 1)) { -#endif +#endif /* MINER_4WAY */ fprintf(stderr, "%s: incorrect engine %d\n", pname, v); show_usage_and_exit(1); } opt_neoscrypt_asm = v; } -#endif +#endif /* ASM */ break; case('n'): -#if !(ASM) +#ifndef ASM /* Nfactor is fixed in the NeoScrypt assembly code */ +#ifdef SHA256 if((opt_algo == ALGO_NEOSCRYPT) || (opt_algo == ALGO_ALTSCRYPT)) { +#else + if(opt_algo == ALGO_NEOSCRYPT) { +#endif /* SHA256 */ v = atoi(arg); /* Nfactor = lb(N) - 1; N = (1 << (Nfactor + 1)) */ if((v < 0) || (v > 30)) { @@ -1832,7 +1869,7 @@ static void parse_arg(int key, char *arg, char *pname) } opt_nfactor = v; } -#endif +#endif /* !ASM */ break; case 'B': @@ -2126,8 +2163,13 @@ int main(int argc, char *argv[]) /* Processor vector extensions detected */ uint opt_flags = cpu_vec_exts(); - /* Configure for SSE2-4way by default */ +#ifdef MINER_4WAY + /* Configure for SSE2 4-way by default */ if(opt_flags & 0x00000020) opt_neoscrypt_asm = 2; +#else + /* Configure for SSE2 by default */ + if(opt_flags & 0x00000020) opt_neoscrypt_asm = 1; +#endif rpc_user = strdup(""); rpc_pass = strdup(""); @@ -2135,33 +2177,40 @@ int main(int argc, char *argv[]) /* parse command line */ parse_cmdline(argc, argv); +#ifdef SHA256 if((opt_algo == ALGO_NEOSCRYPT) || (opt_algo == ALGO_ALTSCRYPT)) { +#else + if(opt_algo == ALGO_NEOSCRYPT) { +#endif + printf("Engines: "); -#if (ASM) -#if (MINER_4WAY) +#ifdef ASM +#ifdef MINER_4WAY printf("INT SSE2 SSE2-4way (enabled: "); if(opt_neoscrypt_asm == 2) printf("SSE2-4way)\n"); #else printf("INT SSE2 (enabled: "); -#endif +#endif /* MINER_4WAY */ if(opt_neoscrypt_asm == 1) printf("SSE2)\n"); if(!opt_neoscrypt_asm) printf("INT)\n"); #else printf("INT (enabled: INT)\n"); -#endif +#endif /* ASM */ if(opt_algo == ALGO_NEOSCRYPT) { opt_neoscrypt_profile = 0x80000020 | (opt_nfactor << 8) | ((opt_neoscrypt_asm & 0x1) << 12); } +#ifdef SHA256 if(opt_algo == ALGO_ALTSCRYPT) { opt_neoscrypt_profile = 0x80000003 | (opt_nfactor << 8) | ((opt_neoscrypt_asm & 0x1) << 12); } +#endif } diff --git a/miner.h b/miner.h index 2ed365331..14d17ff9a 100644 --- a/miner.h +++ b/miner.h @@ -152,12 +152,12 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); #endif extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + const uint32_t *ptarget, uint32_t max_nonce, unsigned int *hashes_done); extern unsigned char *scrypt_buffer_alloc(); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned int *hashes_done); struct thr_info { int id; diff --git a/neoscrypt.c b/neoscrypt.c index 7b2998671..82546e8a1 100644 --- a/neoscrypt.c +++ b/neoscrypt.c @@ -2,7 +2,7 @@ * Copyright (c) 2009 Colin Percival, 2011 ArtForz * Copyright (c) 2012 Andrew Moon (floodyberry) * Copyright (c) 2012 Samuel Neves - * Copyright (c) 2014-2015 John Doering + * Copyright (c) 2014-2016 John Doering * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,19 +35,19 @@ #include "neoscrypt.h" -#if (SHA256) +#ifdef SHA256 /* SHA-256 */ -static const uint32_t sha256_constants[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +static const uint sha256_constants[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; #define Ch(x,y,z) (z ^ (x & (y ^ z))) @@ -70,18 +70,15 @@ static const uint32_t sha256_constants[64] = { r[1] = r[0]; \ r[0] = t0 + t1; - typedef struct sha256_hash_state_t { - uint32_t H[8]; - uint64_t T; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; + uint H[8]; + ullong T; + uint leftover; + uchar buffer[BLOCK_SIZE]; } sha256_hash_state; - -static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) { - uint32_t r[8], w[64], t0, t1; - size_t i; +static void sha256_blocks(sha256_hash_state *S, const uchar *in, uint blocks) { + uint r[8], w[64], t0, t1, i; for(i = 0; i < 8; i++) r[i] = S->H[i]; @@ -100,34 +97,34 @@ static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks r[i] += S->H[i]; S->H[i] = r[i]; } - S->T += SCRYPT_HASH_BLOCK_SIZE * 8; - in += SCRYPT_HASH_BLOCK_SIZE; + S->T += BLOCK_SIZE * 8; + in += BLOCK_SIZE; } } static void neoscrypt_hash_init_sha256(sha256_hash_state *S) { - S->H[0] = 0x6a09e667; - S->H[1] = 0xbb67ae85; - S->H[2] = 0x3c6ef372; - S->H[3] = 0xa54ff53a; - S->H[4] = 0x510e527f; - S->H[5] = 0x9b05688c; - S->H[6] = 0x1f83d9ab; - S->H[7] = 0x5be0cd19; + S->H[0] = 0x6A09E667; + S->H[1] = 0xBB67AE85; + S->H[2] = 0x3C6EF372; + S->H[3] = 0xA54FF53A; + S->H[4] = 0x510E527F; + S->H[5] = 0x9B05688C; + S->H[6] = 0x1F83D9AB; + S->H[7] = 0x5BE0CD19; S->T = 0; S->leftover = 0; } -static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uchar *in, uint inlen) { + uint blocks, want; /* handle the previous data */ if(S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (BLOCK_SIZE - S->leftover); want = (want < inlen) ? want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + neoscrypt_copy(S->buffer + S->leftover, in, want); + S->leftover += (uint)want; + if(S->leftover < BLOCK_SIZE) return; in += want; inlen -= want; @@ -135,28 +132,28 @@ static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in } /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); + blocks = (inlen & ~(BLOCK_SIZE - 1)); + S->leftover = (uint)(inlen - blocks); if(blocks) { - sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + sha256_blocks(S, in, blocks / BLOCK_SIZE); in += blocks; } /* handle leftover data */ if(S->leftover) - memcpy(S->buffer, in, S->leftover); + neoscrypt_copy(S->buffer, in, S->leftover); } -static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { - uint64_t t = S->T + (S->leftover * 8); +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uchar *hash) { + ullong t = S->T + (S->leftover * 8); S->buffer[S->leftover] = 0x80; if(S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + neoscrypt_erase(S->buffer + S->leftover + 1, 55 - S->leftover); } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + neoscrypt_erase(S->buffer + S->leftover + 1, 63 - S->leftover); sha256_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 56); + neoscrypt_erase(S->buffer, 56); } U64TO8_BE(S->buffer + 56, t); @@ -172,13 +169,6 @@ static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { U32TO8_BE(&hash[28], S->H[7]); } -static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { - sha256_hash_state st; - neoscrypt_hash_init_sha256(&st); - neoscrypt_hash_update_sha256(&st, m, mlen); - neoscrypt_hash_finish_sha256(&st, hash); -} - /* HMAC for SHA-256 */ @@ -186,40 +176,49 @@ typedef struct sha256_hmac_state_t { sha256_hash_state inner, outer; } sha256_hmac_state; -static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; - size_t i; +static inline void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, + const uchar *key, uint keylen) { + uchar pad[BLOCK_SIZE + DIGEST_SIZE]; + uint *P = (uint *) pad; + uint i; - neoscrypt_hash_init_sha256(&st->inner); - neoscrypt_hash_init_sha256(&st->outer); + /* The pad initialisation for the inner loop */ + for(i = 0; i < (BLOCK_SIZE >> 2); i++) + P[i] = 0x36363636; - if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, key, keylen); + if(keylen <= BLOCK_SIZE) { + /* XOR the key into the pad */ + neoscrypt_xor(pad, key, keylen); } else { - /* if it's > blocksize bytes, hash it */ - neoscrypt_hash_sha256(pad, key, keylen); + /* Hash the key and XOR into the pad */ + sha256_hash_state st0; + neoscrypt_hash_init_sha256(&st0); + neoscrypt_hash_update_sha256(&st0, key, keylen); + neoscrypt_hash_finish_sha256(&st0, &pad[BLOCK_SIZE]); + neoscrypt_xor(&pad[0], &pad[BLOCK_SIZE], DIGEST_SIZE); } - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + neoscrypt_hash_init_sha256(&st->inner); + /* h(inner || pad) */ + neoscrypt_hash_update_sha256(&st->inner, pad, BLOCK_SIZE); + + /* The pad re-initialisation for the outer loop */ + for(i = 0; i < (BLOCK_SIZE >> 2); i++) + P[i] ^= (0x36363636 ^ 0x5C5C5C5C); + + neoscrypt_hash_init_sha256(&st->outer); + /* h(outer || pad) */ + neoscrypt_hash_update_sha256(&st->outer, pad, BLOCK_SIZE); } -static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { +static inline void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, + const uchar *m, uint mlen) { /* h(inner || m...) */ neoscrypt_hash_update_sha256(&st->inner, m, mlen); } -static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { +static inline void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, + hash_digest mac) { /* h(inner || m) */ hash_digest innerhash; neoscrypt_hash_finish_sha256(&st->inner, innerhash); @@ -232,14 +231,14 @@ static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) /* PBKDF2 for SHA-256 */ -void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, - const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *output, size_t output_len) { +void neoscrypt_pbkdf2_sha256(const uchar *password, uint password_len, + const uchar *salt, uint salt_len, uint N, uchar *output, uint output_len) { sha256_hmac_state hmac_pw, hmac_pw_salt, work; hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, k, blocks; + uchar be[4]; + uint i, j, k, blocks; - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + /* bytes must be <= (0xffffffff - (DIGEST_SIZE - 1)), which they will always be under scrypt */ /* hmac(password, ...) */ neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); @@ -248,20 +247,20 @@ void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, hmac_pw_salt = hmac_pw; neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); - blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + blocks = ((uint)output_len + (DIGEST_SIZE - 1)) / DIGEST_SIZE; for(i = 1; i <= blocks; i++) { /* U1 = hmac(password, salt || be(i)) */ U32TO8_BE(be, i); work = hmac_pw_salt; neoscrypt_hmac_update_sha256(&work, be, 4); neoscrypt_hmac_finish_sha256(&work, ti); - memcpy(u, ti, sizeof(u)); + neoscrypt_copy(u, ti, sizeof(u)); /* T[i] = U1 ^ U2 ^ U3... */ for(j = 0; j < N - 1; j++) { /* UX = hmac(password, U{X-1}) */ work = hmac_pw; - neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_update_sha256(&work, u, DIGEST_SIZE); neoscrypt_hmac_finish_sha256(&work, u); /* T[i] ^= UX */ @@ -269,20 +268,20 @@ void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, ti[k] ^= u[k]; } - memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); - output += SCRYPT_HASH_DIGEST_SIZE; - output_len -= SCRYPT_HASH_DIGEST_SIZE; + neoscrypt_copy(output, ti, (output_len > DIGEST_SIZE) ? DIGEST_SIZE : output_len); + output += DIGEST_SIZE; + output_len -= DIGEST_SIZE; } } -#endif +#endif /* SHA256 */ -#if (BLAKE256) +#ifdef BLAKE256 /* BLAKE-256 */ -const uint8_t blake256_sigma[] = { +const uchar blake256_sigma[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4, @@ -295,24 +294,24 @@ const uint8_t blake256_sigma[] = { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }; -const uint32_t blake256_constants[16] = { - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, - 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, - 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, - 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +const uint blake256_constants[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 }; typedef struct blake256_hash_state_t { - uint32_t H[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; + uint H[8]; + uint T[2]; + uint leftover; + uchar buffer[BLOCK_SIZE]; } blake256_hash_state; -static void blake256_blocks(blake256_hash_state *S, const uint8_t *in, - size_t blocks) { - const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); - uint32_t m[16], v[16], h[8], t[2]; - uint32_t i; +static void blake256_blocks(blake256_hash_state *S, const uchar *in, uint blocks) { + const uchar *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint m[16], v[16], h[8], t[2]; + uint i; for(i = 0; i < 8; i++) h[i] = S->H[i]; @@ -328,9 +327,9 @@ static void blake256_blocks(blake256_hash_state *S, const uint8_t *in, for(i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; for(i = 0; i < 2; i++) - v[i + 12] = blake256_constants[i+4] ^ t[0]; + v[i + 12] = blake256_constants[i + 4] ^ t[0]; for(i = 0; i < 2; i++) - v[i + 14] = blake256_constants[i+6] ^ t[1]; + v[i + 14] = blake256_constants[i + 6] ^ t[1]; for(i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); @@ -390,16 +389,16 @@ static void neoscrypt_hash_init_blake256(blake256_hash_state *S) { } static void neoscrypt_hash_update_blake256(blake256_hash_state *S, - const uint8_t *in, size_t inlen) { - size_t blocks, want; + const uchar *in, uint inlen) { + uint blocks, want; /* handle the previous data */ if(S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (BLOCK_SIZE - S->leftover); want = (want < inlen) ? want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + neoscrypt_copy(S->buffer + S->leftover, in, want); + S->leftover += (uint)want; + if(S->leftover < BLOCK_SIZE) return; in += want; inlen -= want; @@ -407,30 +406,29 @@ static void neoscrypt_hash_update_blake256(blake256_hash_state *S, } /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); + blocks = (inlen & ~(BLOCK_SIZE - 1)); + S->leftover = (uint)(inlen - blocks); if(blocks) { - blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + blake256_blocks(S, in, blocks / BLOCK_SIZE); in += blocks; } /* handle leftover data */ if(S->leftover) - memcpy(S->buffer, in, S->leftover); + neoscrypt_copy(S->buffer, in, S->leftover); } -static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, - uint8_t *hash) { - uint32_t th, tl, bits; +static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, uchar *hash) { + uint th, tl, bits; bits = (S->leftover << 3); tl = S->T[0] + bits; th = S->T[1]; if(S->leftover == 0) { - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; + S->T[0] = (uint)0 - (uint)512; + S->T[1] = (uint)0 - (uint)1; } else if(S->T[0] == 0) { - S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[0] = ((uint)0 - (uint)512) + bits; S->T[1] = S->T[1] - 1; } else { S->T[0] -= (512 - bits); @@ -438,13 +436,13 @@ static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, S->buffer[S->leftover] = 0x80; if(S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + neoscrypt_erase(S->buffer + S->leftover + 1, 55 - S->leftover); } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + neoscrypt_erase(S->buffer + S->leftover + 1, 63 - S->leftover); blake256_blocks(S, S->buffer, 1); - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; - memset(S->buffer, 0, 56); + S->T[0] = (uint)0 - (uint)512; + S->T[1] = (uint)0 - (uint)1; + neoscrypt_erase(S->buffer, 56); } S->buffer[55] |= 1; U32TO8_BE(S->buffer + 56, th); @@ -461,14 +459,6 @@ static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, U32TO8_BE(&hash[28], S->H[7]); } -static void neoscrypt_hash_blake256(hash_digest hash, const uint8_t *m, - size_t mlen) { - blake256_hash_state st; - neoscrypt_hash_init_blake256(&st); - neoscrypt_hash_update_blake256(&st, m, mlen); - neoscrypt_hash_finish_blake256(&st, hash); -} - /* HMAC for BLAKE-256 */ @@ -477,42 +467,48 @@ typedef struct blake256_hmac_state_t { blake256_hash_state outer; } blake256_hmac_state; -static void neoscrypt_hmac_init_blake256(blake256_hmac_state *st, - const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; - size_t i; +static inline void neoscrypt_hmac_init_blake256(blake256_hmac_state *st, + const uchar *key, uint keylen) { + uchar pad[BLOCK_SIZE + DIGEST_SIZE]; + uint *P = (uint *) pad; + uint i; - neoscrypt_hash_init_blake256(&st->inner); - neoscrypt_hash_init_blake256(&st->outer); + /* The pad initialisation for the inner loop */ + for(i = 0; i < (BLOCK_SIZE >> 2); i++) + P[i] = 0x36363636; - if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, key, keylen); + if(keylen <= BLOCK_SIZE) { + /* XOR the key into the pad */ + neoscrypt_xor(pad, key, keylen); } else { - /* if it's > blocksize bytes, hash it */ - neoscrypt_hash_blake256(pad, key, keylen); + /* Hash the key and XOR into the pad */ + blake256_hash_state st0; + neoscrypt_hash_init_blake256(&st0); + neoscrypt_hash_update_blake256(&st0, key, keylen); + neoscrypt_hash_finish_blake256(&st0, &pad[BLOCK_SIZE]); + neoscrypt_xor(&pad[0], &pad[BLOCK_SIZE], DIGEST_SIZE); } - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - neoscrypt_hash_update_blake256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - neoscrypt_hash_update_blake256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + neoscrypt_hash_init_blake256(&st->inner); + /* h(inner || pad) */ + neoscrypt_hash_update_blake256(&st->inner, pad, BLOCK_SIZE); + + /* The pad re-initialisation for the outer loop */ + for(i = 0; i < (BLOCK_SIZE >> 2); i++) + P[i] ^= (0x36363636 ^ 0x5C5C5C5C); + + neoscrypt_hash_init_blake256(&st->outer); + /* h(outer || pad) */ + neoscrypt_hash_update_blake256(&st->outer, pad, BLOCK_SIZE); } -static void neoscrypt_hmac_update_blake256(blake256_hmac_state *st, - const uint8_t *m, size_t mlen) { +static inline void neoscrypt_hmac_update_blake256(blake256_hmac_state *st, + const uchar *m, uint mlen) { /* h(inner || m...) */ neoscrypt_hash_update_blake256(&st->inner, m, mlen); } -static void neoscrypt_hmac_finish_blake256(blake256_hmac_state *st, +static inline void neoscrypt_hmac_finish_blake256(blake256_hmac_state *st, hash_digest mac) { /* h(inner || m) */ hash_digest innerhash; @@ -526,15 +522,15 @@ static void neoscrypt_hmac_finish_blake256(blake256_hmac_state *st, /* PBKDF2 for BLAKE-256 */ -static void neoscrypt_pbkdf2_blake256(const uint8_t *password, - size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, - uint8_t *output, size_t output_len) { +static void neoscrypt_pbkdf2_blake256(const uchar *password, + uint password_len, const uchar *salt, uint salt_len, uint N, + uchar *output, uint output_len) { blake256_hmac_state hmac_pw, hmac_pw_salt, work; hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, k, blocks; + uchar be[4]; + uint i, j, k, blocks; - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + /* bytes must be <= (0xffffffff - (DIGEST_SIZE - 1)), which they will always be under scrypt */ /* hmac(password, ...) */ neoscrypt_hmac_init_blake256(&hmac_pw, password, password_len); @@ -543,20 +539,20 @@ static void neoscrypt_pbkdf2_blake256(const uint8_t *password, hmac_pw_salt = hmac_pw; neoscrypt_hmac_update_blake256(&hmac_pw_salt, salt, salt_len); - blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + blocks = ((uint)output_len + (DIGEST_SIZE - 1)) / DIGEST_SIZE; for(i = 1; i <= blocks; i++) { /* U1 = hmac(password, salt || be(i)) */ U32TO8_BE(be, i); work = hmac_pw_salt; neoscrypt_hmac_update_blake256(&work, be, 4); neoscrypt_hmac_finish_blake256(&work, ti); - memcpy(u, ti, sizeof(u)); + neoscrypt_copy(u, ti, sizeof(u)); /* T[i] = U1 ^ U2 ^ U3... */ for(j = 0; j < N - 1; j++) { /* UX = hmac(password, U{X-1}) */ work = hmac_pw; - neoscrypt_hmac_update_blake256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_update_blake256(&work, u, DIGEST_SIZE); neoscrypt_hmac_finish_blake256(&work, u); /* T[i] ^= UX */ @@ -564,18 +560,18 @@ static void neoscrypt_pbkdf2_blake256(const uint8_t *password, ti[k] ^= u[k]; } - memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); - output += SCRYPT_HASH_DIGEST_SIZE; - output_len -= SCRYPT_HASH_DIGEST_SIZE; + neoscrypt_copy(output, ti, (output_len > DIGEST_SIZE) ? DIGEST_SIZE : output_len); + output += DIGEST_SIZE; + output_len -= DIGEST_SIZE; } } -#endif +#endif /* BLAKE256 */ /* NeoScrypt */ -#if (ASM) +#ifdef ASM extern void neoscrypt_copy(void *dstp, const void *srcp, uint len); extern void neoscrypt_erase(void *dstp, uint len); @@ -654,11 +650,11 @@ static void neoscrypt_chacha(uint *X, uint rounds) { /* Fast 32-bit / 64-bit memcpy(); * len must be a multiple of 32 bytes */ static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { dst[i] = src[i]; dst[i + 1] = src[i + 1]; dst[i + 2] = src[i + 2]; @@ -669,12 +665,12 @@ static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { /* Fast 32-bit / 64-bit block swapper; * len must be a multiple of 32 bytes */ static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { - ulong *blkA = (ulong *) blkAp; - ulong *blkB = (ulong *) blkBp; - register ulong t0, t1, t2, t3; + size_t *blkA = (size_t *) blkAp; + size_t *blkB = (size_t *) blkBp; + register size_t t0, t1, t2, t3; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { t0 = blkA[i]; t1 = blkA[i + 1]; t2 = blkA[i + 2]; @@ -693,11 +689,11 @@ static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { /* Fast 32-bit / 64-bit block XOR engine; * len must be a multiple of 32 bytes */ static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { dst[i] ^= src[i]; dst[i + 1] ^= src[i + 1]; dst[i + 2] ^= src[i + 2]; @@ -707,14 +703,14 @@ static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { /* 32-bit / 64-bit optimised memcpy() */ void neoscrypt_copy(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i, tail; - for(i = 0; i < (len / sizeof(ulong)); i++) + for(i = 0; i < (len / sizeof(size_t)); i++) dst[i] = src[i]; - tail = len & (sizeof(ulong) - 1); + tail = len & (sizeof(size_t) - 1); if(tail) { uchar *dstb = (uchar *) dstp; uchar *srcb = (uchar *) srcp; @@ -726,14 +722,14 @@ void neoscrypt_copy(void *dstp, const void *srcp, uint len) { /* 32-bit / 64-bit optimised memory erase aka memset() to zero */ void neoscrypt_erase(void *dstp, uint len) { - const ulong null = 0; - ulong *dst = (ulong *) dstp; + const size_t null = 0; + size_t *dst = (size_t *) dstp; uint i, tail; - for(i = 0; i < (len / sizeof(ulong)); i++) + for(i = 0; i < (len / sizeof(size_t)); i++) dst[i] = null; - tail = len & (sizeof(ulong) - 1); + tail = len & (sizeof(size_t) - 1); if(tail) { uchar *dstb = (uchar *) dstp; @@ -744,14 +740,14 @@ void neoscrypt_erase(void *dstp, uint len) { /* 32-bit / 64-bit optimised XOR engine */ void neoscrypt_xor(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i, tail; - for(i = 0; i < (len / sizeof(ulong)); i++) + for(i = 0; i < (len / sizeof(size_t)); i++) dst[i] ^= src[i]; - tail = len & (sizeof(ulong) - 1); + tail = len & (sizeof(size_t) - 1); if(tail) { uchar *dstb = (uchar *) dstp; uchar *srcb = (uchar *) srcp; @@ -761,15 +757,11 @@ void neoscrypt_xor(void *dstp, const void *srcp, uint len) { } } -#endif +#endif /* ASM */ /* BLAKE2s */ -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U - /* Parameter block of 32 bytes */ typedef struct blake2s_param_t { uchar digest_length; @@ -789,10 +781,10 @@ typedef struct blake2s_state_t { uint h[8]; uint t[2]; uint f[2]; - uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uchar buf[2 * BLOCK_SIZE]; uint buflen; uint padding[3]; - uchar tempbuf[BLAKE2S_BLOCK_SIZE]; + uchar tempbuf[BLOCK_SIZE]; } blake2s_state; static const uint blake2s_IV[8] = { @@ -800,7 +792,7 @@ static const uint blake2s_IV[8] = { 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -#if (ASM) +#ifdef ASM extern void blake2s_compress(blake2s_state *S); @@ -2209,7 +2201,7 @@ static void blake2s_compress(blake2s_state *S) { S->h[7] ^= v[7] ^ v[15]; } -#endif +#endif /* ASM */ static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { @@ -2217,18 +2209,18 @@ static void blake2s_update(blake2s_state *S, const uchar *input, while(input_size > 0) { left = S->buflen; - fill = 2 * BLAKE2S_BLOCK_SIZE - left; + fill = 2 * BLOCK_SIZE - left; if(input_size > fill) { /* Buffer fill */ neoscrypt_copy(S->buf + left, input, fill); S->buflen += fill; /* Counter increment */ - S->t[0] += BLAKE2S_BLOCK_SIZE; + S->t[0] += BLOCK_SIZE; /* Compress */ blake2s_compress(S); /* Shift buffer left */ - neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); - S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLOCK_SIZE, BLOCK_SIZE); + S->buflen -= BLOCK_SIZE; input += fill; input_size -= fill; } else { @@ -2243,7 +2235,7 @@ static void blake2s_update(blake2s_state *S, const uchar *input, void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, void *output, const uchar output_size) { - uchar block[BLAKE2S_BLOCK_SIZE]; + uchar block[BLOCK_SIZE]; blake2s_param P[1]; blake2s_state S[1]; @@ -2258,30 +2250,30 @@ void neoscrypt_blake2s(const void *input, const uint input_size, neoscrypt_copy(S, blake2s_IV, 32); neoscrypt_xor(S, P, 32); - neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_erase(block, BLOCK_SIZE); neoscrypt_copy(block, key, key_size); - blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + blake2s_update(S, (uchar *) block, BLOCK_SIZE); /* Update */ blake2s_update(S, (uchar *) input, input_size); /* Finish */ - if(S->buflen > BLAKE2S_BLOCK_SIZE) { - S->t[0] += BLAKE2S_BLOCK_SIZE; + if(S->buflen > BLOCK_SIZE) { + S->t[0] += BLOCK_SIZE; blake2s_compress(S); - S->buflen -= BLAKE2S_BLOCK_SIZE; - neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + S->buflen -= BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLOCK_SIZE, S->buflen); } S->t[0] += S->buflen; S->f[0] = ~0U; - neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + neoscrypt_erase(S->buf + S->buflen, 2 * BLOCK_SIZE - S->buflen); blake2s_compress(S); /* Write back */ neoscrypt_copy(output, S, output_size); } -#if !(OPT) +#ifndef OPT #define FASTKDF_BUFFER_SIZE 256U @@ -2293,9 +2285,7 @@ void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, uint N, uchar *output, uint output_len) { const size_t stack_align = 0x40; const uint kdf_buf_size = FASTKDF_BUFFER_SIZE, - prf_input_size = BLAKE2S_BLOCK_SIZE, - prf_key_size = BLAKE2S_KEY_SIZE, - prf_output_size = BLAKE2S_OUT_SIZE; + prf_input_size = 64, prf_key_size = 32, prf_output_size = 32; uint bufptr, a, b, i, j; uchar *A, *B, *prf_input, *prf_key, *prf_output; @@ -2381,7 +2371,7 @@ void neoscrypt_fastkdf(const uchar *password, uint password_len, #else -#if (ASM) +#ifdef ASM extern void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, uchar *output, uint mode); @@ -2437,20 +2427,16 @@ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, neoscrypt_copy(&S[12], &B[bufptr], 32); neoscrypt_erase(&S[20], 32); - /* BLAKE2s: update input */ - neoscrypt_copy(&S[28], &A[bufptr], 64); - S[44] = 128; - - /* BLAKE2s: compress */ + /* BLAKE2s: compress IV using key */ S[8] = 64; blake2s_compress((blake2s_state *) S); - S[44] = 64; - neoscrypt_copy(&S[12], &S[28], 64); - /* BLAKE2s: compress again */ + /* BLAKE2s: update input */ + neoscrypt_copy(&S[12], &A[bufptr], 64); + + /* BLAKE2s: compress again using input */ S[8] = 128; S[10] = ~0U; - neoscrypt_erase(&S[28], 64); blake2s_compress((blake2s_state *) S); for(j = 0, bufptr = 0; j < 8; j++) { @@ -2482,12 +2468,12 @@ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, } } -#endif /* (ASM) */ +#endif /* ASM */ -#endif /* (OPT) */ +#endif /* !(OPT) */ -#if !(ASM) +#ifndef ASM /* Configurable optimised block mixer */ static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { @@ -2506,14 +2492,14 @@ static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { if(r == 1) { if(mixer) { - neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &X[16], BLOCK_SIZE); neoscrypt_chacha(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[16], &X[0], BLOCK_SIZE); neoscrypt_chacha(&X[16], rounds); } else { - neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &X[16], BLOCK_SIZE); neoscrypt_salsa(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[16], &X[0], BLOCK_SIZE); neoscrypt_salsa(&X[16], rounds); } return; @@ -2521,43 +2507,43 @@ static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { if(r == 2) { if(mixer) { - neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &X[48], BLOCK_SIZE); neoscrypt_chacha(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[16], &X[0], BLOCK_SIZE); neoscrypt_chacha(&X[16], rounds); - neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[32], &X[16], BLOCK_SIZE); neoscrypt_chacha(&X[32], rounds); - neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[48], &X[32], BLOCK_SIZE); neoscrypt_chacha(&X[48], rounds); - neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + neoscrypt_blkswp(&X[16], &X[32], BLOCK_SIZE); } else { - neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &X[48], BLOCK_SIZE); neoscrypt_salsa(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[16], &X[0], BLOCK_SIZE); neoscrypt_salsa(&X[16], rounds); - neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[32], &X[16], BLOCK_SIZE); neoscrypt_salsa(&X[32], rounds); - neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[48], &X[32], BLOCK_SIZE); neoscrypt_salsa(&X[48], rounds); - neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + neoscrypt_blkswp(&X[16], &X[32], BLOCK_SIZE); } return; } /* Reference code for any reasonable r */ for(i = 0; i < 2 * r; i++) { - if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); - else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], BLOCK_SIZE); if(mixer) neoscrypt_chacha(&X[16 * i], rounds); else neoscrypt_salsa(&X[16 * i], rounds); - neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], BLOCK_SIZE); } for(i = 0; i < r; i++) - neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], BLOCK_SIZE); for(i = 0; i < r; i++) - neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], BLOCK_SIZE); } @@ -2610,14 +2596,14 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { r = (1 << ((profile >> 5) & 0x7)); } - uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align]; - /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + uchar stack[(N + 3) * r * 2 * BLOCK_SIZE + stack_align]; + /* X = r * 2 * BLOCK_SIZE */ X = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); /* Z is a copy of X for ChaCha */ Z = &X[32 * r]; /* Y is an X sized temporal space */ Y = &X[64 * r]; - /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + /* V = N * r * 2 * BLOCK_SIZE */ V = &X[96 * r]; /* X = KDF(password, salt) */ @@ -2627,25 +2613,25 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { default: case(0x0): -#if (OPT) +#ifdef OPT neoscrypt_fastkdf_opt(password, password, (uchar *) X, 0); #else neoscrypt_fastkdf(password, 80, password, 80, 32, - (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + (uchar *) X, r * 2 * BLOCK_SIZE); #endif break; -#if (SHA256) +#ifdef SHA256 case(0x1): neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, - (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + (uchar *) X, r * 2 * BLOCK_SIZE); break; #endif -#if (BLAKE256) +#ifdef BLAKE256 case(0x2): neoscrypt_pbkdf2_blake256(password, 80, password, 80, 1, - (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + (uchar *) X, r * 2 * BLOCK_SIZE); break; #endif @@ -2655,12 +2641,12 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { if(dblmix) { /* blkcpy(Z, X) */ - neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * BLOCK_SIZE); /* Z = SMix(Z) */ for(i = 0; i < N; i++) { /* blkcpy(V, Z) */ - neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * BLOCK_SIZE); /* blkmix(Z, Y) */ neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); } @@ -2669,7 +2655,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { /* integerify(Z) mod N */ j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); /* blkxor(Z, V) */ - neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * BLOCK_SIZE); /* blkmix(Z, Y) */ neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); } @@ -2678,7 +2664,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { /* X = SMix(X) */ for(i = 0; i < N; i++) { /* blkcpy(V, X) */ - neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * BLOCK_SIZE); /* blkmix(X, Y) */ neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); } @@ -2686,39 +2672,39 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { /* integerify(X) mod N */ j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); /* blkxor(X, V) */ - neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &V[j], r * 2 * BLOCK_SIZE); /* blkmix(X, Y) */ neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); } if(dblmix) /* blkxor(X, Z) */ - neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * BLOCK_SIZE); /* output = KDF(password, X) */ switch(kdf) { default: case(0x0): -#if (OPT) +#ifdef OPT neoscrypt_fastkdf_opt(password, (uchar *) X, output, 1); #else neoscrypt_fastkdf(password, 80, (uchar *) X, - r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + r * 2 * BLOCK_SIZE, 32, output, 32); #endif break; -#if (SHA256) +#ifdef SHA256 case(0x1): neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, - r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + r * 2 * BLOCK_SIZE, 1, output, 32); break; #endif -#if (BLAKE256) +#ifdef BLAKE256 case(0x2): neoscrypt_pbkdf2_blake256(password, 80, (uchar *) X, - r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + r * 2 * BLOCK_SIZE, 1, output, 32); break; #endif @@ -2729,7 +2715,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { #endif /* !(ASM) */ -#if (ASM) && (MINER_4WAY) +#if defined(ASM) && defined(MINER_4WAY) extern void neoscrypt_xor_salsa_4way(uint *X, uint *X0, uint *Y, uint double_rounds); extern void neoscrypt_xor_chacha_4way(uint *Z, uint *Z0, uint *Y, uint double_rounds); @@ -2750,11 +2736,11 @@ extern void neoscrypt_xor_4way(void *dstp, const void *srcAp, /* The following code is for reference only */ static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { dst[i] = src[i]; dst[i + 1] = src[i + 1]; dst[i + 2] = src[i + 2]; @@ -2763,12 +2749,12 @@ static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { } static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { - ulong *blkA = (ulong *) blkAp; - ulong *blkB = (ulong *) blkBp; - register ulong t0, t1, t2, t3; + size_t *blkA = (size_t *) blkAp; + size_t *blkB = (size_t *) blkBp; + register size_t t0, t1, t2, t3; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { t0 = blkA[i]; t1 = blkA[i + 1]; t2 = blkA[i + 2]; @@ -2785,11 +2771,11 @@ static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { } static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; + size_t *dst = (size_t *) dstp; + size_t *src = (size_t *) srcp; uint i; - for(i = 0; i < (len / sizeof(ulong)); i += 4) { + for(i = 0; i < (len / sizeof(size_t)); i += 4) { dst[i] ^= src[i]; dst[i + 1] ^= src[i + 1]; dst[i + 2] ^= src[i + 2]; @@ -2848,148 +2834,142 @@ static void neoscrypt_xor_4way(void *dstp, const void *srcAp, #endif -/* 4-way NeoScrypt implementation; - * Basic customisation (required): - * profile bit 0: - * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; - * 1 = Scrypt(1024, 1, 1) with Salsa20/8; - * profile bits 31 to 1 are reserved */ -void neoscrypt_4way(const uchar *password, uchar *output, uint profile) { - const size_t stack_align = 0x40; +/* 4-way NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ +void neoscrypt_4way(const uchar *password, uchar *output, uchar *scratchpad) { + const uint N = 128, r = 2, double_rounds = 10; + uint *X, *Z, *V, *Y, *P; uint i, j0, j1, j2, j3, k; - /* 2 * SCRYPT_BLOCK_SIZE compacted to 128 below */ - -#if (SHA256) - if(!profile) { -#endif - const uint N = 128, r = 2, double_rounds = 10; - uint *X, *Z, *V, *Y, *P; - - uchar stack[4 * ((N + 3) * r * 128 + 80) + stack_align]; - X = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); - Z = &X[4 * 32 * r]; - V = &X[4 * 64 * r]; - /* Y is a temporary work space */ - Y = &X[4 * (N + 2) * 32 * r]; - /* P is a set of passwords 80 bytes each */ - P = &X[4 * (N + 3) * 32 * r]; - - /* Load the password and increment nonces */ - for(k = 0; k < 4; k++) { - neoscrypt_copy(&P[k * 20], password, 80); - P[(k + 1) * 20 - 1] += k; - } + /* 2 * BLOCK_SIZE compacted to 128 below */; - neoscrypt_fastkdf_4way((uchar *) &P[0], (uchar *) &P[0], (uchar *) &Y[0], 0); + /* Scratchpad size is 4 * ((N + 3) * r * 128 + 80) bytes */ - neoscrypt_pack_4way(&X[0], &Y[0], 4 * r * 128); + X = (uint *) &scratchpad[0]; + Z = &X[4 * 32 * r]; + V = &X[4 * 64 * r]; + /* Y is a temporary work space */ + Y = &X[4 * (N + 2) * 32 * r]; + /* P is a set of passwords 80 bytes each */ + P = &X[4 * (N + 3) * 32 * r]; - neoscrypt_blkcpy(&Z[0], &X[0], 4 * r * 128); + /* Load the password and increment nonces */ + for(k = 0; k < 4; k++) { + neoscrypt_copy(&P[k * 20], password, 80); + P[(k + 1) * 20 - 1] += k; + } - for(i = 0; i < N; i++) { - neoscrypt_blkcpy(&V[i * r * 128], &Z[0], 4 * r * 128); - neoscrypt_xor_chacha_4way(&Z[0], &Z[192], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[64], &Z[0], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[128], &Z[64], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[192], &Z[128], &Y[0], double_rounds); - neoscrypt_blkswp(&Z[64], &Z[128], r * 128); - } + neoscrypt_fastkdf_4way((uchar *) &P[0], (uchar *) &P[0], (uchar *) &Y[0], + (uchar *) &scratchpad[0], 0); - for(i = 0; i < N; i++) { - j0 = (4 * r * 32) * (Z[64 * (2 * r - 1)] & (N - 1)); - j1 = (4 * r * 32) * (Z[1 + (64 * (2 * r - 1))] & (N - 1)); - j2 = (4 * r * 32) * (Z[2 + (64 * (2 * r - 1))] & (N - 1)); - j3 = (4 * r * 32) * (Z[3 + (64 * (2 * r - 1))] & (N - 1)); - neoscrypt_xor_4way(&Z[0], - &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); - neoscrypt_xor_chacha_4way(&Z[0], &Z[192], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[64], &Z[0], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[128], &Z[64], &Y[0], double_rounds); - neoscrypt_xor_chacha_4way(&Z[192], &Z[128], &Y[0], double_rounds); - neoscrypt_blkswp(&Z[64], &Z[128], 256); - } + neoscrypt_pack_4way(&X[0], &Y[0], 4 * r * 128); - for(i = 0; i < N; i++) { - neoscrypt_blkcpy(&V[i * r * 128], &X[0], 4 * r * 128); - neoscrypt_xor_salsa_4way(&X[0], &X[192], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[128], &X[64], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[192], &X[128], &Y[0], double_rounds); - neoscrypt_blkswp(&X[64], &X[128], r * 128); - } + neoscrypt_blkcpy(&Z[0], &X[0], 4 * r * 128); - for(i = 0; i < N; i++) { - j0 = (4 * r * 32) * (X[64 * (2 * r - 1)] & (N - 1)); - j1 = (4 * r * 32) * (X[1 + (64 * (2 * r - 1))] & (N - 1)); - j2 = (4 * r * 32) * (X[2 + (64 * (2 * r - 1))] & (N - 1)); - j3 = (4 * r * 32) * (X[3 + (64 * (2 * r - 1))] & (N - 1)); - neoscrypt_xor_4way(&X[0], - &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); - neoscrypt_xor_salsa_4way(&X[0], &X[192], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[128], &X[64], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[192], &X[128], &Y[0], double_rounds); - neoscrypt_blkswp(&X[64], &X[128], r * 128); - } + for(i = 0; i < N; i++) { + neoscrypt_blkcpy(&V[i * r * 128], &Z[0], 4 * r * 128); + neoscrypt_xor_chacha_4way(&Z[0], &Z[192], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[64], &Z[0], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[128], &Z[64], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[192], &Z[128], &Y[0], double_rounds); + neoscrypt_blkswp(&Z[64], &Z[128], r * 128); + } - neoscrypt_blkxor(&X[0], &Z[0], 4 * r * 128); + for(i = 0; i < N; i++) { + j0 = (4 * r * 32) * (Z[64 * (2 * r - 1)] & (N - 1)); + j1 = (4 * r * 32) * (Z[1 + (64 * (2 * r - 1))] & (N - 1)); + j2 = (4 * r * 32) * (Z[2 + (64 * (2 * r - 1))] & (N - 1)); + j3 = (4 * r * 32) * (Z[3 + (64 * (2 * r - 1))] & (N - 1)); + neoscrypt_xor_4way(&Z[0], + &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); + neoscrypt_xor_chacha_4way(&Z[0], &Z[192], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[64], &Z[0], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[128], &Z[64], &Y[0], double_rounds); + neoscrypt_xor_chacha_4way(&Z[192], &Z[128], &Y[0], double_rounds); + neoscrypt_blkswp(&Z[64], &Z[128], 256); + } - neoscrypt_unpack_4way(&Y[0], &X[0], 4 * r * 128); + for(i = 0; i < N; i++) { + neoscrypt_blkcpy(&V[i * r * 128], &X[0], 4 * r * 128); + neoscrypt_xor_salsa_4way(&X[0], &X[192], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[128], &X[64], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[192], &X[128], &Y[0], double_rounds); + neoscrypt_blkswp(&X[64], &X[128], r * 128); + } - neoscrypt_fastkdf_4way((uchar *) &P[0], (uchar *) &Y[0], (uchar *) &output[0], 1); + for(i = 0; i < N; i++) { + j0 = (4 * r * 32) * (X[64 * (2 * r - 1)] & (N - 1)); + j1 = (4 * r * 32) * (X[1 + (64 * (2 * r - 1))] & (N - 1)); + j2 = (4 * r * 32) * (X[2 + (64 * (2 * r - 1))] & (N - 1)); + j3 = (4 * r * 32) * (X[3 + (64 * (2 * r - 1))] & (N - 1)); + neoscrypt_xor_4way(&X[0], + &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); + neoscrypt_xor_salsa_4way(&X[0], &X[192], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[128], &X[64], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[192], &X[128], &Y[0], double_rounds); + neoscrypt_blkswp(&X[64], &X[128], r * 128); + } -#if (SHA256) - } else { + neoscrypt_blkxor(&X[0], &Z[0], 4 * r * 128); - const uint N = 1024, r = 1, double_rounds = 4; - uint *X, *V, *Y, *P; + neoscrypt_unpack_4way(&Y[0], &X[0], 4 * r * 128); - uchar stack[4 * ((N + 2) * r * 128 + 80) + stack_align]; - X = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); - V = &X[4 * 32 * r]; - Y = &X[4 * (N + 1) * 32 * r]; - P = &X[4 * (N + 2) * 32 * r]; + neoscrypt_fastkdf_4way((uchar *) &P[0], (uchar *) &Y[0], (uchar *) &output[0], + (uchar *) &scratchpad[0], 1); +} - for(k = 0; k < 4; k++) { - neoscrypt_copy(&P[k * 20], password, 80); - P[(k + 1) * 20 - 1] += k; - } +#ifdef SHA256 +/* 4-way Scrypt(1024, 1, 1) with Salsa20/8 */ +void scrypt_4way(const uchar *password, uchar *output, uchar *scratchpad) { + const uint N = 1024, r = 1, double_rounds = 4; + uint *X, *V, *Y, *P; + uint i, j0, j1, j2, j3, k; - for(k = 0; k < 4; k++) - neoscrypt_pbkdf2_sha256((uchar *) &P[k * 20], 80, - (uchar *) &P[k * 20], 80, 1, - (uchar *) &Y[k * r * 32], r * 128); + /* Scratchpad size is 4 * ((N + 2) * r * 128 + 80) bytes */ - neoscrypt_pack_4way(&X[0], &Y[0], 4 * r * 128); + X = (uint *) &scratchpad[0]; + V = &X[4 * 32 * r]; + Y = &X[4 * (N + 1) * 32 * r]; + P = &X[4 * (N + 2) * 32 * r]; - for(i = 0; i < N; i++) { - neoscrypt_blkcpy(&V[i * r * 128], &X[0], 4 * r * 128); - neoscrypt_xor_salsa_4way(&X[0], &X[64], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); - } + for(k = 0; k < 4; k++) { + neoscrypt_copy(&P[k * 20], password, 80); + P[(k + 1) * 20 - 1] += k; + } - for(i = 0; i < N; i++) { - j0 = (4 * r * 32) * (X[64 * (2 * r - 1)] & (N - 1)); - j1 = (4 * r * 32) * (X[1 + (64 * (2 * r - 1))] & (N - 1)); - j2 = (4 * r * 32) * (X[2 + (64 * (2 * r - 1))] & (N - 1)); - j3 = (4 * r * 32) * (X[3 + (64 * (2 * r - 1))] & (N - 1)); - neoscrypt_xor_4way(&X[0], - &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); - neoscrypt_xor_salsa_4way(&X[0], &X[64], &Y[0], double_rounds); - neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); - } + for(k = 0; k < 4; k++) + neoscrypt_pbkdf2_sha256((uchar *) &P[k * 20], 80, + (uchar *) &P[k * 20], 80, 1, + (uchar *) &Y[k * r * 32], r * 128); - neoscrypt_unpack_4way(&Y[0], &X[0], 4 * r * 128); + neoscrypt_pack_4way(&X[0], &Y[0], 4 * r * 128); - for(k = 0; k < 4; k++) - neoscrypt_pbkdf2_sha256((uchar *) &P[k * 20], 80, - (uchar *) &Y[k * r * 32], r * 128, 1, - (uchar *) &output[k * 32], 32); + for(i = 0; i < N; i++) { + neoscrypt_blkcpy(&V[i * r * 128], &X[0], 4 * r * 128); + neoscrypt_xor_salsa_4way(&X[0], &X[64], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); + } + for(i = 0; i < N; i++) { + j0 = (4 * r * 32) * (X[64 * (2 * r - 1)] & (N - 1)); + j1 = (4 * r * 32) * (X[1 + (64 * (2 * r - 1))] & (N - 1)); + j2 = (4 * r * 32) * (X[2 + (64 * (2 * r - 1))] & (N - 1)); + j3 = (4 * r * 32) * (X[3 + (64 * (2 * r - 1))] & (N - 1)); + neoscrypt_xor_4way(&X[0], + &V[j0], &V[j1], &V[j2], &V[j3], 4 * r * 128); + neoscrypt_xor_salsa_4way(&X[0], &X[64], &Y[0], double_rounds); + neoscrypt_xor_salsa_4way(&X[64], &X[0], &Y[0], double_rounds); } -#endif /* (SHA256) */ + + neoscrypt_unpack_4way(&Y[0], &X[0], 4 * r * 128); + + for(k = 0; k < 4; k++) + neoscrypt_pbkdf2_sha256((uchar *) &P[k * 20], 80, + (uchar *) &Y[k * r * 32], r * 128, 1, + (uchar *) &output[k * 32], 32); } +#endif /* SHA256 */ extern void blake2s_compress_4way(void *T); @@ -3012,7 +2992,7 @@ void neoscrypt_blake2s_4way(const uchar *input, const uchar *key, uchar *output) uint *T; /* Align and set up the buffer in stack */ - uchar stack[1024 + stack_align]; + uchar stack[704 + stack_align]; T = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); /* Initialise */ @@ -3023,26 +3003,17 @@ void neoscrypt_blake2s_4way(const uchar *input, const uchar *key, uchar *output) neoscrypt_pack_4way(&T[48], &key[0], 128); neoscrypt_erase(&T[80], 128); - /* Update inputs */ - neoscrypt_pack_4way(&T[112], &input[0], 256); - T[176] = 128; - T[177] = 128; - T[178] = 128; - T[179] = 128; - - /* Compress */ + /* Compress IVs using keys */ T[32] = 64; T[33] = 64; T[34] = 64; T[35] = 64; blake2s_compress_4way(&T[0]); - T[176] = 64; - T[177] = 64; - T[178] = 64; - T[179] = 64; - neoscrypt_copy(&T[48], &T[112], 256); - /* Compress again */ + /* Update inputs */ + neoscrypt_pack_4way(&T[48], &input[0], 256); + + /* Compress using inputs */ T[32] = 128; T[33] = 128; T[34] = 128; @@ -3051,7 +3022,6 @@ void neoscrypt_blake2s_4way(const uchar *input, const uchar *key, uchar *output) T[41] = ~0U; T[42] = ~0U; T[43] = ~0U; - neoscrypt_erase(&T[112], 256); blake2s_compress_4way(&T[0]); neoscrypt_unpack_4way(&output[0], &T[0], 128); @@ -3060,25 +3030,22 @@ void neoscrypt_blake2s_4way(const uchar *input, const uchar *key, uchar *output) /* 4-way FastKDF with BLAKE2s integrated */ void neoscrypt_fastkdf_4way(const uchar *password, const uchar *salt, - uchar *output, uint mode) { - const size_t stack_align = 0x40; + uchar *output, uchar *scratchpad, uint mode) { uint bufptr_a = 0, bufptr_b = 0, bufptr_c = 0, bufptr_d = 0; uint output_len, i, j; uint *T; uchar *Aa, *Ab, *Ac, *Ad; uchar *Ba, *Bb, *Bc, *Bd; - /* Align and set up the buffers in stack */ - uchar stack[3456 + stack_align]; - T = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); - Aa = (uchar *) &T[256]; - Ab = (uchar *) &T[336]; - Ac = (uchar *) &T[416]; - Ad = (uchar *) &T[496]; - Ba = (uchar *) &T[576]; - Bb = (uchar *) &T[648]; - Bc = (uchar *) &T[720]; - Bd = (uchar *) &T[792]; + T = (uint *) &scratchpad[0]; + Aa = (uchar *) &T[176]; + Ab = (uchar *) &T[256]; + Ac = (uchar *) &T[336]; + Ad = (uchar *) &T[416]; + Ba = (uchar *) &T[496]; + Bb = (uchar *) &T[568]; + Bc = (uchar *) &T[640]; + Bd = (uchar *) &T[712]; neoscrypt_copy(&Aa[0], &password[0], 80); neoscrypt_copy(&Aa[80], &password[0], 80); @@ -3141,7 +3108,7 @@ void neoscrypt_fastkdf_4way(const uchar *password, const uchar *salt, neoscrypt_copy(&T[0], blake2s_IV_P_XOR_4way, 128); neoscrypt_erase(&T[32], 64); - /* BLAKE2s: update key */ + /* BLAKE2s: update keys */ for(j = 0; j < 32; j += 8) { T[j + 48] = *((uint *) &Ba[bufptr_a + j]); T[j + 49] = *((uint *) &Bb[bufptr_b + j]); @@ -3154,35 +3121,26 @@ void neoscrypt_fastkdf_4way(const uchar *password, const uchar *salt, } neoscrypt_erase(&T[80], 128); - /* BLAKE2s: update input */ - for(j = 0; j < 64; j += 8) { - T[j + 112] = *((uint *) &Aa[bufptr_a + j]); - T[j + 113] = *((uint *) &Ab[bufptr_b + j]); - T[j + 114] = *((uint *) &Ac[bufptr_c + j]); - T[j + 115] = *((uint *) &Ad[bufptr_d + j]); - T[j + 116] = *((uint *) &Aa[bufptr_a + j + 4]); - T[j + 117] = *((uint *) &Ab[bufptr_b + j + 4]); - T[j + 118] = *((uint *) &Ac[bufptr_c + j + 4]); - T[j + 119] = *((uint *) &Ad[bufptr_d + j + 4]); - } - T[176] = 128; - T[177] = 128; - T[178] = 128; - T[179] = 128; - - /* BLAKE2s: compress */ + /* BLAKE2s: compress IVs using keys */ T[32] = 64; T[33] = 64; T[34] = 64; T[35] = 64; blake2s_compress_4way(&T[0]); - T[176] = 64; - T[177] = 64; - T[178] = 64; - T[179] = 64; - neoscrypt_copy(&T[48], &T[112], 256); - /* BLAKE2s: compress again */ + /* BLAKE2s: update inputs */ + for(j = 0; j < 64; j += 8) { + T[j + 48] = *((uint *) &Aa[bufptr_a + j]); + T[j + 49] = *((uint *) &Ab[bufptr_b + j]); + T[j + 50] = *((uint *) &Ac[bufptr_c + j]); + T[j + 51] = *((uint *) &Ad[bufptr_d + j]); + T[j + 52] = *((uint *) &Aa[bufptr_a + j + 4]); + T[j + 53] = *((uint *) &Ab[bufptr_b + j + 4]); + T[j + 54] = *((uint *) &Ac[bufptr_c + j + 4]); + T[j + 55] = *((uint *) &Ad[bufptr_d + j + 4]); + } + + /* BLAKE2s: compress using inputs */ T[32] = 128; T[33] = 128; T[34] = 128; @@ -3191,8 +3149,8 @@ void neoscrypt_fastkdf_4way(const uchar *password, const uchar *salt, T[41] = ~0U; T[42] = ~0U; T[43] = ~0U; - neoscrypt_erase(&T[112], 256); blake2s_compress_4way(&T[0]); + bufptr_a = 0; bufptr_b = 0; bufptr_c = 0; @@ -3295,13 +3253,11 @@ void neoscrypt_fastkdf_4way(const uchar *password, const uchar *salt, #endif /* (ASM) && (MINER_4WAY) */ -#if !(ASM) - +#ifndef ASM uint cpu_vec_exts() { - /* No assembly, no exensions */ + /* No assembly, no extensions */ return(0); } - #endif diff --git a/neoscrypt.h b/neoscrypt.h index 1ce337b48..f385b4368 100644 --- a/neoscrypt.h +++ b/neoscrypt.h @@ -2,7 +2,7 @@ extern "C" { #endif -void neoscrypt(const unsigned char *input, unsigned char *output, +void neoscrypt(const unsigned char *password, unsigned char *output, unsigned int profile); void neoscrypt_blake2s(const void *input, const unsigned int input_size, @@ -13,15 +13,21 @@ void neoscrypt_copy(void *dstp, const void *srcp, unsigned int len); void neoscrypt_erase(void *dstp, unsigned int len); void neoscrypt_xor(void *dstp, const void *srcp, unsigned int len); -#if (ASM) && (MINER_4WAY) -void neoscrypt_4way(const unsigned char *input, unsigned char *output, - unsigned int profile); +#if defined(ASM) && defined(MINER_4WAY) +void neoscrypt_4way(const unsigned char *password, unsigned char *output, + unsigned char *scratchpad); + +#ifdef SHA256 +void scrypt_4way(const unsigned char *password, unsigned char *output, + unsigned char *scratchpad); +#endif void neoscrypt_blake2s_4way(const unsigned char *input, const unsigned char *key, unsigned char *output); void neoscrypt_fastkdf_4way(const unsigned char *password, - const unsigned char *salt, unsigned char *output, unsigned int mode); + const unsigned char *salt, unsigned char *output, unsigned char *scratchpad, + const unsigned int mode); #endif unsigned int cpu_vec_exts(void); @@ -30,13 +36,9 @@ unsigned int cpu_vec_exts(void); } #else -#if (WINDOWS) && (__APPLE__) -/* sizeof(unsigned long) = 4 for MinGW64 and Mac GCC */ -typedef unsigned long long ulong; -#else -typedef unsigned long ulong; -#endif -typedef unsigned int uint; +typedef unsigned long long ullong; +typedef signed long long llong; +typedef unsigned int uint; typedef unsigned char uchar; #ifndef MIN @@ -47,11 +49,10 @@ typedef unsigned char uchar; #define MAX(a, b) ((a) > (b) ? a : b) #endif -#define SCRYPT_BLOCK_SIZE 64 -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 32 +#define BLOCK_SIZE 64 +#define DIGEST_SIZE 32 -typedef uchar hash_digest[SCRYPT_HASH_DIGEST_SIZE]; +typedef uchar hash_digest[DIGEST_SIZE]; #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) diff --git a/neoscrypt_asm.S b/neoscrypt_asm.S index cabe085ed..3889cb8a6 100644 --- a/neoscrypt_asm.S +++ b/neoscrypt_asm.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2015 John Doering + * Copyright (c) 2014-2016 John Doering * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -24,7 +24,11 @@ * SUCH DAMAGE. */ -#if (ASM) && (__x86_64__) +#if defined(ASM) && defined(__x86_64__) + +/* MOVQ_FIX addresses incorrect behaviour of old GNU assembler when transferring + * data between a 64-bit general purpose register and an MMX/SSE register: + * suffix or operands invalid for `movq' */ /* blake2s_compress(mem) * AMD64 BLAKE2s block compression; @@ -39,12 +43,19 @@ _blake2s_compress: pushq %r13 pushq %r14 pushq %r15 -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi movq %rcx, %rdi #endif + +#ifndef MOVQ_FIX movq %rsp, %mm0 +#else + movd %esp, %mm0 + shrq $32, %rsp + movd %esp, %mm7 +#endif /* initialise */ movl 0(%rdi), %eax @@ -1301,8 +1312,16 @@ _blake2s_compress: xorl %r12d, 16(%rdi) /* finalise */ xorl %r14d, 24(%rdi) /* finalise */ +#ifndef MOVQ_FIX movq %mm0, %rsp -#if (WIN64) +#else + movd %mm0, %esp + movd %mm7, %eax + shlq $32, %rax + orq %rax, %rsp +#endif + +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -1312,6 +1331,7 @@ _blake2s_compress: popq %r12 popq %rbp popq %rbx + emms ret @@ -1321,7 +1341,7 @@ _blake2s_compress: .globl _neoscrypt_copy neoscrypt_copy: _neoscrypt_copy: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -1372,7 +1392,7 @@ _neoscrypt_copy: jnz .byte_copy .copy_finish: -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -1385,7 +1405,7 @@ _neoscrypt_copy: .globl _neoscrypt_erase neoscrypt_erase: _neoscrypt_erase: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -1428,7 +1448,7 @@ _neoscrypt_erase: jnz .byte_erase .erase_finish: -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -1441,7 +1461,7 @@ _neoscrypt_erase: .globl _neoscrypt_xor neoscrypt_xor: _neoscrypt_xor: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -1496,7 +1516,7 @@ _neoscrypt_xor: jnz .byte_xor .xor_finish: -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -1515,7 +1535,7 @@ _neoscrypt_fastkdf_opt: pushq %r13 pushq %r14 pushq %r15 -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi subq $160, %rsp @@ -1535,7 +1555,9 @@ _neoscrypt_fastkdf_opt: movq %r9, %rcx #endif - subq $980, %rsp +/* 64 bytes (local variables) + 64 bytes (alignment space) + 320 bytes (password + * buffer) + 288 bytes (salt buffer) + 112 bytes (BLAKE2s space) = 848 bytes */ + subq $848, %rsp leaq 128(%rsp), %rbp andq $0xFFFFFFFFFFFFFFC0, %rbp movq %rdx, 48(%rsp) @@ -1661,40 +1683,26 @@ _neoscrypt_fastkdf_opt: movdqa %xmm5, 80(%r14) movdqa %xmm5, 96(%r14) - movdqu 0(%rbp), %xmm0 - movdqu 16(%rbp), %xmm1 - movdqu 32(%rbp), %xmm2 - movdqu 48(%rbp), %xmm3 - movdqa %xmm0, 112(%r14) - movdqa %xmm1, 128(%r14) - movdqa %xmm2, 144(%r14) - movdqa %xmm3, 160(%r14) - movl $128, 176(%r14) - -#if (WIN64) +#ifdef WIN64 movq %r14, %rcx #else movq %r14, %rdi #endif call blake2s_compress - pxor %xmm5, %xmm5 - movdqa 112(%r14), %xmm0 - movdqa 128(%r14), %xmm1 - movdqa 144(%r14), %xmm2 - movdqa 160(%r14), %xmm3 - movl $128, 32(%r14) - movl $0xFFFFFFFF, 40(%r14) + movdqu 0(%rbp), %xmm0 + movdqu 16(%rbp), %xmm1 + movdqu 32(%rbp), %xmm2 + movdqu 48(%rbp), %xmm3 movdqa %xmm0, 48(%r14) movdqa %xmm1, 64(%r14) movdqa %xmm2, 80(%r14) movdqa %xmm3, 96(%r14) - movdqa %xmm5, 112(%r14) - movdqa %xmm5, 128(%r14) - movdqa %xmm5, 144(%r14) - movdqa %xmm5, 160(%r14) -#if (WIN64) + movl $128, 32(%r14) + movl $0xFFFFFFFF, 40(%r14) + +#ifdef WIN64 movq %r14, %rcx #else movq %r14, %rdi @@ -1710,7 +1718,12 @@ _neoscrypt_fastkdf_opt: psadbw %xmm5, %xmm0 movhlps %xmm0, %xmm1 paddq %xmm1, %xmm0 +#ifndef MOVQ_FIX movq %xmm0, %r13 +#else + movq %xmm0, 0(%r14) + movq 0(%r14), %r13 +#endif andq $0xFF, %r13 leaq 320(%r12, %r13), %rbx movdqu 0(%rbx), %xmm0 @@ -1724,7 +1737,7 @@ _neoscrypt_fastkdf_opt: movq $32, %rdx cmpq %r13, %rdx jc .fastkdf_headupd -#if (WIN64) +#ifdef WIN64 movq %rdx, %r8 leaq 256(%rbx), %rcx movq %rbx, %rdx @@ -1744,7 +1757,7 @@ _neoscrypt_fastkdf_opt: jnc .fastkdf_loop_end movq %r13, %rax subq %rdx, %rax -#if (WIN64) +#ifdef WIN64 leaq 320(%r12), %rcx leaq 576(%r12), %rdx movq %rax, %r8 @@ -1767,7 +1780,7 @@ _neoscrypt_fastkdf_opt: jc .fastkdf_crosscopy leaq 320(%r12, %r13), %rbp -#if (WIN64) +#ifdef WIN64 movq %rbp, %rcx movq %r12, %rdx movq %r15, %r8 @@ -1777,7 +1790,7 @@ _neoscrypt_fastkdf_opt: movq %r15, %rdx #endif call neoscrypt_xor -#if (WIN64) +#ifdef WIN64 movq %r14, %rcx movq %rbp, %rdx movq %r15, %r8 @@ -1791,7 +1804,7 @@ _neoscrypt_fastkdf_opt: .fastkdf_crosscopy: leaq 320(%r12, %r13), %rbx -#if (WIN64) +#ifdef WIN64 movq %rbx, %rcx movq %r12, %rdx movq %rbp, %r8 @@ -1803,7 +1816,7 @@ _neoscrypt_fastkdf_opt: call neoscrypt_xor leaq 320(%r12), %rdi leaq 0(%r12, %rbp), %rsi -#if (WIN64) +#ifdef WIN64 movq %rdi, %rcx movq %rsi, %rdx movq %r15, %r8 @@ -1813,7 +1826,7 @@ _neoscrypt_fastkdf_opt: subq %rbp, %rdx #endif call neoscrypt_xor -#if (WIN64) +#ifdef WIN64 movq %r14, %rcx movq %rbx, %rdx movq %rbp, %r8 @@ -1823,7 +1836,7 @@ _neoscrypt_fastkdf_opt: movq %rbp, %rdx #endif call neoscrypt_copy -#if (WIN64) +#ifdef WIN64 leaq 0(%r14, %rbp), %rcx leaq 320(%r12), %rdx movq %r15, %r8 @@ -1837,9 +1850,9 @@ _neoscrypt_fastkdf_opt: call neoscrypt_copy .fastkdf_finish: - addq $980, %rsp + addq $848, %rsp -#if (WIN64) +#ifdef WIN64 movdqu 0(%rsp), %xmm15 movdqu 16(%rsp), %xmm14 movdqu 32(%rsp), %xmm13 @@ -2488,7 +2501,7 @@ neoscrypt_xor_chacha: .globl _neoscrypt neoscrypt: _neoscrypt: -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi movq %rcx, %rdi @@ -2506,13 +2519,13 @@ _neoscrypt: movq %rsi, %r15 movq %rdx, %rbx -#if (SHA256) +#ifdef SHA256 /* Scrypt mode */ testl $0x01, %ebx jnz .scrypt #endif -#if (WIN64) +#ifdef WIN64 /* attempt to allocate 33280 + 128 bytes of stack space fails miserably; * have to use malloc() and free() instead */ subq $128, %rsp @@ -2535,11 +2548,11 @@ _neoscrypt: movq %rax, 32(%rsp) /* memory base: X, Z, V */ leaq 128(%rsp), %rbp -#endif /* (WIN64) */ +#endif /* WIN64 */ /* FastKDF */ -#if (WIN64) -#if (OPT) +#ifdef WIN64 +#ifdef OPT movq %r14, %rcx movq %r14, %rdx movq %rbp, %r8 @@ -2555,18 +2568,18 @@ _neoscrypt: movq %rbp, 40(%rsp) movq $256, 48(%rsp) call neoscrypt_fastkdf -#endif /* (OPT) */ +#endif /* OPT */ #else -#if (OPT) +#ifdef OPT movq %r14, %rdi movq %r14, %rsi movq %rbp, %rdx xorq %rcx, %rcx -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt -#endif /* (__APPLE__) */ +#endif /* __APPLE__ */ #else movq $80, %rax movq %r14, %rdi @@ -2576,13 +2589,13 @@ _neoscrypt: movq $32, %r8 movq %rbp, %r9 movq $256, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf #else call neoscrypt_fastkdf -#endif /* (__APPLE__) */ -#endif /* (OPT) */ -#endif /* (WIN64) */ +#endif /* __APPLE__ */ +#endif /* OPT */ +#endif /* WIN64 */ /* blkcpy(Z, X) */ leaq 256(%rbp), %rax @@ -3033,8 +3046,8 @@ _neoscrypt: movdqa %xmm15, 240(%rbp) /* FastKDF */ -#if (WIN64) -#if (OPT) +#ifdef WIN64 +#ifdef OPT movq %r14, %rcx movq %rbp, %rdx movq %r15, %r8 @@ -3051,19 +3064,19 @@ _neoscrypt: movq %r15, 40(%rsp) movq %rax, 48(%rsp) call neoscrypt_fastkdf -#endif /* (OPT) */ +#endif /* OPT */ #else -#if (OPT) +#ifdef OPT movq %r14, %rdi movq %rbp, %rsi movq %r15, %rdx xorq %rcx, %rcx incq %rcx -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt -#endif /* (__APPLE__) */ +#endif /* __APPLE__ */ #else movq %r14, %rdi movq $80, %rsi @@ -3072,15 +3085,15 @@ _neoscrypt: movq $32, %r8 movq %r15, %r9 movq $32, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf #else call neoscrypt_fastkdf -#endif /* (__APPLE__) */ -#endif /* (OPT) */ -#endif /* (WIN64) */ +#endif /* __APPLE__ */ +#endif /* OPT */ +#endif /* WIN64 */ -#if (WIN64) +#ifdef WIN64 /* free memory */ movq 64(%rsp), %rcx call free @@ -3096,7 +3109,7 @@ _neoscrypt: popq %r12 popq %rbp popq %rbx -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -3519,8 +3532,8 @@ _neoscrypt: movdqa %xmm15, 240(%rbp) /* FastKDF */ -#if (WIN64) -#if (OPT) +#ifdef WIN64 +#ifdef OPT movq %r14, %rcx movq %rbp, %rdx movq %r15, %r8 @@ -3537,19 +3550,19 @@ _neoscrypt: movq %r15, 40(%rsp) movq %rax, 48(%rsp) call neoscrypt_fastkdf -#endif /* (OPT) */ +#endif /* OPT */ #else -#if (OPT) +#ifdef OPT movq %r14, %rdi movq %rbp, %rsi movq %r15, %rdx xorq %rcx, %rcx incq %rcx -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt -#endif /* (__APPLE__) */ +#endif /* __APPLE__ */ #else movq %r14, %rdi movq $80, %rsi @@ -3559,15 +3572,15 @@ _neoscrypt: movq %rax, %r8 movq %r15, %r9 movq %rax, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_fastkdf #else call neoscrypt_fastkdf -#endif /* (__APPLE__) */ -#endif /* (OPT) */ -#endif /* (WIN64) */ +#endif /* __APPLE__ */ +#endif /* OPT */ +#endif /* WIN64 */ -#if (WIN64) +#ifdef WIN64 /* free memory */ movq 64(%rsp), %rcx call free @@ -3583,16 +3596,16 @@ _neoscrypt: popq %r12 popq %rbp popq %rbx -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif ret -#if (SHA256) +#ifdef SHA256 .scrypt: -#if (WIN64) +#ifdef WIN64 /* attempt to allocate 131200 + 128 bytes of stack space fails miserably; * have to use malloc() and free() instead */ subq $128, %rsp @@ -3615,10 +3628,10 @@ _neoscrypt: movq %rax, 32(%rsp) /* memory base: X, Z, V */ leaq 128(%rsp), %rbp -#endif /* (WIN64) */ +#endif /* WIN64 */ /* PBKDF2-HMAC-SHA256 */ -#if (WIN64) +#ifdef WIN64 movq $80, %rax movq %r14, %rcx movq %rax, %rdx @@ -3637,12 +3650,12 @@ _neoscrypt: movq $1, %r8 movq %rbp, %r9 movq $128, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 -#endif /* (__APPLE__) */ -#endif /* (WIN64) */ +#endif /* __APPLE__ */ +#endif /* WIN64 */ /* SSE2 switch */ testl $0x1000, %ebx @@ -3737,7 +3750,7 @@ _neoscrypt: jnz .salsa_s2 /* PBKDF2-HMAC-SHA256 */ -#if (WIN64) +#ifdef WIN64 movq %r14, %rcx movq $80, %rdx movq %rbp, %r8 @@ -3754,15 +3767,15 @@ _neoscrypt: movq $1, %r8 movq %r15, %r9 movq $32, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 -#endif /* (__APPLE__) */ +#endif /* __APPLE__ */ -#endif /* (WIN64) */ +#endif /* WIN64 */ -#if (WIN64) +#ifdef WIN64 /* free memory */ movq 64(%rsp), %rcx call free @@ -3778,7 +3791,7 @@ _neoscrypt: popq %r12 popq %rbp popq %rbx -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -3880,7 +3893,7 @@ _neoscrypt: call neoscrypt_salsa_tangle_sse2 /* PBKDF2-HMAC-SHA256 */ -#if (WIN64) +#ifdef WIN64 movq %r14, %rcx movq $80, %rdx movq %rbp, %r8 @@ -3897,14 +3910,14 @@ _neoscrypt: movq $1, %r8 movq %r15, %r9 movq $32, 0(%rsp) -#if (__APPLE__) +#ifdef __APPLE__ call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 -#endif /* (__APPLE__) */ -#endif /* (WIN64) */ +#endif /* __APPLE__ */ +#endif /* WIN64 */ -#if (WIN64) +#ifdef WIN64 /* free memory */ movq 64(%rsp), %rcx call free @@ -3920,15 +3933,15 @@ _neoscrypt: popq %r12 popq %rbp popq %rbx -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif ret -#endif /* (SHA256) */ +#endif /* SHA256 */ -#if (MINER_4WAY) +#ifdef MINER_4WAY /* blake2s_compress_4way(mem) * AMD64 (SSE2) BLAKE2s 4-way block compression */ @@ -3942,7 +3955,7 @@ _blake2s_compress_4way: pushq %r13 pushq %r14 pushq %r15 -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi subq $160, %rsp @@ -3960,7 +3973,7 @@ _blake2s_compress_4way: #endif /* initialise */ - leaq 768(%rdi), %rsi + leaq 448(%rdi), %rsi movdqa 0(%rdi), %xmm0 movdqa 16(%rdi), %xmm8 movdqa 32(%rdi), %xmm4 @@ -3985,14 +3998,26 @@ _blake2s_compress_4way: movq %r8, 136(%rsi) /* movq %r9, 144(%rsi) */ /* movq %r9, 152(%rsi) */ +#ifndef MOVQ_FIX movq %r9, %xmm9 movlhps %xmm9, %xmm9 +#else + movq %r9, 0(%rsi) + movq %r9, 8(%rsi) + movdqa 0(%rsi), %xmm9 +#endif movq %r10, 160(%rsi) movq %r10, 168(%rsi) /* movq %r11, 176(%rsi) */ /* movq %r11, 184(%rsi) */ +#ifndef MOVQ_FIX movq %r11, %xmm11 movlhps %xmm11, %xmm11 +#else + movq %r11, 0(%rsi) + movq %r11, 8(%rsi) + movdqa 0(%rsi), %xmm11 +#endif movq $0x510E527F510E527F, %r12 movq $0x9B05688C9B05688C, %r13 movq $0x1F83D9AB1F83D9AB, %r14 @@ -4026,18 +4051,30 @@ _blake2s_compress_4way: /* movq %rax, 192(%rsi) */ /* movq %rbx, 200(%rsi) */ /* movdqa 192(%rsi), %xmm3 */ /* A */ +#ifndef MOVQ_FIX movq %rax, %xmm3 movq %rbx, %xmm15 movlhps %xmm15, %xmm3 +#else + movq %rax, 0(%rsi) + movq %rbx, 8(%rsi) + movdqa 0(%rsi), %xmm3 +#endif movq %rcx, 208(%rsi) pxor %xmm0, %xmm3 /* A */ movq %rdx, 216(%rsi) /* movq %r8, 224(%rsi) */ /* movq %r9, 232(%rsi) */ /* movdqa 224(%rsi), %xmm7 */ /* C */ +#ifndef MOVQ_FIX movq %r8, %xmm7 movq %r9, %xmm15 movlhps %xmm15, %xmm7 +#else + movq %r8, 0(%rsi) + movq %r9, 8(%rsi) + movdqa 0(%rsi), %xmm7 +#endif movq %r10, 240(%rsi) pxor %xmm4, %xmm7 /* C */ movq %r11, 248(%rsi) @@ -6815,7 +6852,7 @@ _blake2s_compress_4way: movdqa %xmm13, 96(%rdi) movdqa %xmm5, 112(%rdi) -#if (WIN64) +#ifdef WIN64 movdqu 0(%rsp), %xmm15 movdqu 16(%rsp), %xmm14 movdqu 32(%rsp), %xmm13 @@ -6846,7 +6883,7 @@ _blake2s_compress_4way: .globl _neoscrypt_blkcpy neoscrypt_blkcpy: _neoscrypt_blkcpy: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -6870,7 +6907,7 @@ _neoscrypt_blkcpy: addq %rax, %rsi decl %ecx jnz .blkcpy -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -6884,7 +6921,7 @@ _neoscrypt_blkcpy: .globl _neoscrypt_blkswp neoscrypt_blkswp: _neoscrypt_blkswp: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -6916,7 +6953,7 @@ _neoscrypt_blkswp: addq %rax, %rsi decl %ecx jnz .blkswp -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -6930,7 +6967,7 @@ _neoscrypt_blkswp: .globl _neoscrypt_blkxor neoscrypt_blkxor: _neoscrypt_blkxor: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -6958,7 +6995,7 @@ _neoscrypt_blkxor: addq %rax, %rsi decl %ecx jnz .blkxor -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -6971,7 +7008,7 @@ _neoscrypt_blkxor: .globl _neoscrypt_pack_4way neoscrypt_pack_4way: _neoscrypt_pack_4way: -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi movq %rcx, %rdi @@ -6998,7 +7035,7 @@ _neoscrypt_pack_4way: decq %rcx jnz .pack_4way -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -7011,7 +7048,7 @@ _neoscrypt_pack_4way: .globl _neoscrypt_unpack_4way neoscrypt_unpack_4way: _neoscrypt_unpack_4way: -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi movq %rcx, %rdi @@ -7038,7 +7075,7 @@ _neoscrypt_unpack_4way: decq %rcx jnz .unpack_4way -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -7053,7 +7090,7 @@ neoscrypt_xor_4way: _neoscrypt_xor_4way: pushq %rbx pushq %rbp -#if (WIN64) +#ifdef WIN64 pushq %rdi pushq %rsi movq %rcx, %rdi @@ -7084,7 +7121,7 @@ _neoscrypt_xor_4way: decq %rcx jnz .xor_4way -#if (WIN64) +#ifdef WIN64 popq %rsi popq %rdi #endif @@ -7099,7 +7136,7 @@ _neoscrypt_xor_4way: .globl _neoscrypt_xor_salsa_4way neoscrypt_xor_salsa_4way: _neoscrypt_xor_salsa_4way: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -7477,7 +7514,7 @@ _neoscrypt_xor_salsa_4way: movdqa %xmm13, 208(%rdi) movdqa %xmm14, 224(%rdi) movdqa %xmm15, 240(%rdi) -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif @@ -7490,7 +7527,7 @@ _neoscrypt_xor_salsa_4way: .globl _neoscrypt_xor_chacha_4way neoscrypt_xor_chacha_4way: _neoscrypt_xor_chacha_4way: -#if (WIN64) +#ifdef WIN64 movq %rdi, %r10 movq %rsi, %r11 movq %rcx, %rdi @@ -7876,13 +7913,13 @@ _neoscrypt_xor_chacha_4way: movdqa %xmm13, 208(%rdi) movdqa %xmm14, 224(%rdi) movdqa %xmm15, 240(%rdi) -#if (WIN64) +#ifdef WIN64 movq %r10, %rdi movq %r11, %rsi #endif ret -#endif /* (MINER_4WAY) */ +#endif /* MINER_4WAY */ /* cpu_vec_exts() * AMD64 detector of any processor vector extensions present @@ -7983,7 +8020,7 @@ _cpu_vec_exts: #endif /* (ASM) && (__x86_64__) */ -#if (ASM) && (__i386__) +#if defined(ASM) && defined(__i386__) /* blake2s_compress(mem) * i386 BLAKE2s block compression */ @@ -10187,12 +10224,15 @@ _neoscrypt_fastkdf_opt: pushl %esi pushl %edi - subl $948, %esp +/* 32 bytes (call stack and local variables + 64 bytes (alignment space) + + * 320 bytes (password buffer) + 288 bytes (salt buffer) + 112 bytes (BLAKE2s + * space) = 816 bytes */ + subl $816, %esp leal 96(%esp), %ebp andl $0xFFFFFFC0, %ebp movl %ebp, 28(%esp) - movl 968(%esp), %edx + movl 836(%esp), %edx movq 0(%edx), %mm0 movq 8(%edx), %mm1 movq 16(%edx), %mm2 @@ -10244,11 +10284,11 @@ _neoscrypt_fastkdf_opt: movq %mm0, 224(%ebp) movq %mm1, 232(%ebp) - movl 972(%esp), %edx + movl 840(%esp), %edx leal 320(%ebp), %ebx movl $32, 20(%esp) xorl %edi, %edi - testl $0x01, 980(%esp) + testl $0x01, 848(%esp) jnz .fastkdf_mode_one movl $256, 24(%esp) @@ -10404,6 +10444,9 @@ _neoscrypt_fastkdf_opt: movq %mm0, 96(%esi) movq %mm0, 104(%esi) + movl %esi, 0(%esp) + call blake2s_compress + movq 0(%ebp), %mm0 movq 8(%ebp), %mm1 movq 16(%ebp), %mm2 @@ -10412,53 +10455,24 @@ _neoscrypt_fastkdf_opt: movq 40(%ebp), %mm5 movq 48(%ebp), %mm6 movq 56(%ebp), %mm7 - movq %mm0, 112(%esi) - movq %mm1, 120(%esi) - movq %mm2, 128(%esi) - movq %mm3, 136(%esi) - movq %mm4, 144(%esi) - movq %mm5, 152(%esi) - movq %mm6, 160(%esi) - movq %mm7, 168(%esi) - movl $128, 176(%esi) - - movl %esi, 0(%esp) - call blake2s_compress - - movq 112(%esi), %mm0 - movq 120(%esi), %mm1 - movq 128(%esi), %mm2 - movq 136(%esi), %mm3 - movq 144(%esi), %mm4 - movq 152(%esi), %mm5 - movq 160(%esi), %mm6 - movq 168(%esi), %mm7 - movl $128, 32(%esi) - movl $0xFFFFFFFF, 40(%esi) movq %mm0, 48(%esi) movq %mm1, 56(%esi) movq %mm2, 64(%esi) movq %mm3, 72(%esi) - pxor %mm0, %mm0 movq %mm4, 80(%esi) movq %mm5, 88(%esi) movq %mm6, 96(%esi) movq %mm7, 104(%esi) - movq %mm0, 112(%esi) - movq %mm0, 120(%esi) - movq %mm0, 128(%esi) - movq %mm0, 136(%esi) - movq %mm0, 144(%esi) - movq %mm0, 152(%esi) - movq %mm0, 160(%esi) - movq %mm0, 168(%esi) + movl $128, 32(%esi) + movl $0xFFFFFFFF, 40(%esi) call blake2s_compress movq 0(%esi), %mm3 movq 8(%esi), %mm5 movq 16(%esi), %mm6 movq 24(%esi), %mm7 + pxor %mm0, %mm0 movq %mm3, %mm4 paddb %mm5, %mm3 paddb %mm6, %mm3 @@ -10524,7 +10538,7 @@ _neoscrypt_fastkdf_opt: movl %ebp, 4(%esp) movl %esi, 8(%esp) call neoscrypt_xor - movl 976(%esp), %eax + movl 844(%esp), %eax movl %eax, 0(%esp) movl %ebx, 4(%esp) call neoscrypt_copy @@ -10543,12 +10557,12 @@ _neoscrypt_fastkdf_opt: subl %ebx, %esi movl %esi, 8(%esp) call neoscrypt_xor - movl 976(%esp), %eax + movl 844(%esp), %eax movl %eax, 0(%esp) movl %edi, 4(%esp) movl %ebx, 8(%esp) call neoscrypt_copy - movl 976(%esp), %eax + movl 844(%esp), %eax leal 0(%eax, %ebx), %eax movl %eax, 0(%esp) leal 320(%ebp), %edx @@ -10557,7 +10571,7 @@ _neoscrypt_fastkdf_opt: call neoscrypt_copy .fastkdf_finish: - addl $948, %esp + addl $816, %esp popl %edi popl %esi popl %ebp @@ -11292,13 +11306,13 @@ _neoscrypt: movl 24(%esp), %edi movl 28(%esp), %ebx -#if (SHA256) +#ifdef SHA256 /* Scrypt mode */ testl $0x01, %ebx jnz .scrypt #endif -#if (WIN32) +#ifdef WIN32 /* attempt to allocate 33280 + 128 bytes of stack space fails miserably; * have to use malloc() and free() instead */ subl $64, %esp @@ -11321,16 +11335,16 @@ _neoscrypt: movl %eax, 32(%esp) /* memory base: X, Z, V */ leal 128(%esp), %ebp -#endif /* (WIN32) */ +#endif /* WIN32 */ /* FastKDF */ -#if (OPT) +#ifdef OPT movl %esi, 0(%esp) movl %esi, 4(%esp) movl %ebp, 8(%esp) xorl %eax, %eax movl %eax, 12(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt @@ -11344,12 +11358,12 @@ _neoscrypt: movl $32, 16(%esp) movl %ebp, 20(%esp) movl $256, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf #else call neoscrypt_fastkdf #endif /* (WIN32) || (__APPLE__) */ -#endif /* (OPT) */ +#endif /* OPT */ /* SSE2 switch */ testl $0x1000, %ebx @@ -12114,12 +12128,12 @@ _neoscrypt: movq %mm7, 248(%ebp) /* FastKDF */ -#if (OPT) +#ifdef OPT movl %esi, 0(%esp) movl %ebp, 4(%esp) movl %edi, 8(%esp) movl $1, 12(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt @@ -12132,14 +12146,14 @@ _neoscrypt: movl $32, 16(%esp) movl %edi, 20(%esp) movl $32, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf #else call neoscrypt_fastkdf #endif /* (WIN32) || (__APPLE__) */ -#endif /* (OPT) */ +#endif /* OPT */ -#if (WIN32) +#ifdef WIN32 /* free memory */ movl 32(%esp), %eax movl %eax, 0(%esp) @@ -12619,12 +12633,12 @@ _neoscrypt: movdqa %xmm7, 240(%ebp) /* FastKDF */ -#if (OPT) +#ifdef OPT movl %esi, 0(%esp) movl %ebp, 4(%esp) movl %edi, 8(%esp) movl $1, 12(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf_opt #else call neoscrypt_fastkdf_opt @@ -12637,14 +12651,14 @@ _neoscrypt: movl $32, 16(%esp) movl %edi, 20(%esp) movl $32, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_fastkdf #else call neoscrypt_fastkdf #endif /* (WIN32) || (__APPLE__) */ -#endif /* (OPT) */ +#endif /* OPT */ -#if (WIN32) +#ifdef WIN32 /* free memory */ movl 32(%esp), %eax movl %eax, 0(%esp) @@ -12661,10 +12675,10 @@ _neoscrypt: popl %ebx ret -#if (SHA256) +#ifdef SHA256 .scrypt: -#if (WIN32) +#ifdef WIN32 /* attempt to allocate 131200 + 128 bytes of stack space fails miserably; * have to use malloc() and free() instead */ subl $64, %esp @@ -12687,7 +12701,7 @@ _neoscrypt: movl %eax, 32(%esp) /* memory base: X, Z, V */ leal 128(%esp), %ebp -#endif /* (WIN32) */ +#endif /* WIN32 */ /* PBKDF2-HMAC-SHA256 */ movl $80, %eax @@ -12698,7 +12712,7 @@ _neoscrypt: movl $1, 16(%esp) movl %ebp, 20(%esp) movl $128, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 @@ -12843,13 +12857,13 @@ _neoscrypt: movl $1, 16(%esp) movl %edi, 20(%esp) movl $32, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 #endif -#if (WIN32) +#ifdef WIN32 /* free memory */ movl 32(%esp), %eax movl %eax, 0(%esp) @@ -12969,13 +12983,13 @@ _neoscrypt: movl $1, 16(%esp) movl %edi, 20(%esp) movl $32, 24(%esp) -#if (WIN32) || (__APPLE__) +#if defined(WIN32) || defined(__APPLE__) call _neoscrypt_pbkdf2_sha256 #else call neoscrypt_pbkdf2_sha256 #endif -#if (WIN32) +#ifdef WIN32 /* free memory */ movl 32(%esp), %eax movl %eax, 0(%esp) @@ -12992,9 +13006,9 @@ _neoscrypt: popl %ebx ret -#endif /* (SHA256) */ +#endif /* SHA256 */ -#if (MINER_4WAY) +#ifdef MINER_4WAY /* blake2s_compress_4way(mem) * i386 (SSE2) BLAKE2s 4-way block compression */ @@ -13009,7 +13023,7 @@ _blake2s_compress_4way: movl 20(%esp), %edi /* initialise */ - leal 768(%edi), %esi + leal 448(%edi), %esi movdqa 0(%edi), %xmm0 movdqa 16(%edi), %xmm2 movdqa 32(%edi), %xmm3 @@ -16925,7 +16939,7 @@ _neoscrypt_xor_chacha_4way: ret -#endif /* (MINER_4WAY) */ +#endif /* MINER_4WAY */ /* cpu_vec_exts() * i386 detector of any vector extensions present diff --git a/scrypt.c b/scrypt.c index 702551b3a..6ee9d8efd 100644 --- a/scrypt.c +++ b/scrypt.c @@ -695,10 +695,8 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input, } #endif /* HAVE_SCRYPT_6WAY */ -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ +int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, + const uint32_t *ptarget, uint32_t max_nonce, unsigned int *hashes_done) { uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; diff --git a/sha2.c b/sha2.c index 367efda30..d90ed3023 100644 --- a/sha2.c +++ b/sha2.c @@ -469,8 +469,7 @@ void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ + const uint32_t *ptarget, uint32_t max_nonce, unsigned int *hashes_done) { uint32_t data[4 * 64] __attribute__((aligned(128))); uint32_t hash[4 * 8] __attribute__((aligned(32))); uint32_t midstate[4 * 8] __attribute__((aligned(32))); @@ -528,8 +527,7 @@ void sha256d_ms_8way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ + const uint32_t *ptarget, uint32_t max_nonce, unsigned int *hashes_done) { uint32_t data[8 * 64] __attribute__((aligned(128))); uint32_t hash[8 * 8] __attribute__((aligned(32))); uint32_t midstate[8 * 8] __attribute__((aligned(32))); @@ -582,8 +580,7 @@ static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, #endif /* HAVE_SHA256_8WAY */ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ + uint32_t max_nonce, unsigned int *hashes_done) { uint32_t data[64] __attribute__((aligned(128))); uint32_t hash[8] __attribute__((aligned(32))); uint32_t midstate[8] __attribute__((aligned(32))); diff --git a/version.h b/version.h index f7539eb02..cf34759ea 100644 --- a/version.h +++ b/version.h @@ -1,3 +1,3 @@ #define VERSION_MAJOR 2 #define VERSION_MINOR 4 -#define VERSION_REVISION 2 +#define VERSION_REVISION 3