ghash: move addmul operation into inline functions
jedisct1 committed Nov 14, 2022
1 parent 8489b4d commit ae82b44
Showing 2 changed files with 98 additions and 158 deletions.
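
Background note (added for context, not part of the original commit): GHASH works in GF(2^128), where the per-block recurrence is acc <- (acc XOR m_i) * H. The loops changed below aggregate n blocks per reduction, using the precomputed key powers in st->hx[] to compute

$$ \mathrm{acc}' = \mathrm{reduce}\bigl( (\mathrm{acc} \oplus m_0)\,H^{n} \oplus m_1\,H^{n-1} \oplus \cdots \oplus m_{n-1}\,H \bigr) $$

which equals n single-block steps because the reduction is linear over XOR. The new helpers encode exactly this split: gh_update0 folds the previous accumulator into the first block and multiplies by the highest power, gh_update XORs one more unreduced 256-bit product into the accumulator, and a single gcm_reduce finishes the batch. This is also why the powers are indexed highest-first, as st->hx[n - 1 - j]. A runnable sketch of the identity follows the diff.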
128 changes: 49 additions & 79 deletions src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -75,15 +75,6 @@ typedef struct I256 {
     BlockVec mid;
 } I256;
 
-static inline I256 __vectorcall XOR256(const I256 a, const I256 b)
-{
-    return (I256) {
-        SODIUM_C99(.hi =) XOR128(a.hi, b.hi),
-        SODIUM_C99(.lo =) XOR128(a.lo, b.lo),
-        SODIUM_C99(.mid =) XOR128(a.mid, b.mid),
-    };
-}
-
 typedef BlockVec Precomp;
 
 typedef struct GHash {
@@ -307,6 +298,21 @@ gh_init(GHash *sth)
     sth->acc = ZERO128;
 }
 
+static inline I256 __vectorcall gh_update0(const GHash *const sth, const unsigned char *const p,
+                                           const Precomp hn)
+{
+    const BlockVec m = REV128(LOAD128(p));
+    return clmul128(XOR128(sth->acc, m), hn);
+}
+
+static inline void __vectorcall gh_update(I256 *const u, const unsigned char *p, const Precomp hn)
+{
+    const BlockVec m = REV128(LOAD128(p));
+    const I256     t = clmul128(m, hn);
+    *u = (I256) { SODIUM_C99(.hi =) XOR128(u->hi, t.hi), SODIUM_C99(.lo =) XOR128(u->lo, t.lo),
+                  SODIUM_C99(.mid =) XOR128(u->mid, t.mid) };
+}
+
 /* Absorb ad_len bytes of associated data. There has to be no partial block. */
 
 static inline void
@@ -316,58 +322,48 @@ gh_ad_blocks(const State *st, GHash *sth, const unsigned char *ad, size_t ad_len
 
     i = (size_t) 0U;
     for (; i + PC_COUNT * 16 <= ad_len; i += PC_COUNT * 16) {
-        BlockVec m = REV128(LOAD128(ad + i));
-        I256     u = clmul128(XOR128(sth->acc, m), st->hx[PC_COUNT - 1 - 0]);
+        I256   u = gh_update0(sth, ad + i, st->hx[PC_COUNT - 1 - 0]);
         size_t j;
 
         for (j = 1; j < PC_COUNT; j += 1) {
-            m = REV128(LOAD128(ad + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PC_COUNT - 1 - j]));
+            gh_update(&u, ad + i + j * 16, st->hx[PC_COUNT - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
     for (; i + PC_COUNT * 16 / 2 <= ad_len; i += PC_COUNT * 16 / 2) {
-        BlockVec m = REV128(LOAD128(ad + i));
-        I256     u = clmul128(XOR128(sth->acc, m), st->hx[PC_COUNT / 2 - 1 - 0]);
+        I256   u = gh_update0(sth, ad + i, st->hx[PC_COUNT / 2 - 1 - 0]);
         size_t j;
 
         for (j = 1; j < PC_COUNT / 2; j += 1) {
-            m = REV128(LOAD128(ad + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PC_COUNT / 2 - 1 - j]));
+            gh_update(&u, ad + i + j * 16, st->hx[PC_COUNT / 2 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
     for (; i + 4 * 16 <= ad_len; i += 4 * 16) {
-        BlockVec m = REV128(LOAD128(ad + i));
-        I256     u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]);
+        I256   u = gh_update0(sth, ad + i, st->hx[4 - 1 - 0]);
         size_t j;
 
         for (j = 1; j < 4; j += 1) {
-            m = REV128(LOAD128(ad + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[4 - 1 - j]));
+            gh_update(&u, ad + i + j * 16, st->hx[4 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
     for (; i + 2 * 16 <= ad_len; i += 2 * 16) {
-        BlockVec m = REV128(LOAD128(ad + i));
-        I256     u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]);
+        I256   u = gh_update0(sth, ad + i, st->hx[2 - 1 - 0]);
         size_t j;
 
         for (j = 1; j < 2; j += 1) {
-            m = REV128(LOAD128(ad + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[2 - 1 - j]));
+            gh_update(&u, ad + i + j * 16, st->hx[2 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
     if (i < ad_len) {
         const size_t n = (ad_len - i) / 16;
-        BlockVec     m = REV128(LOAD128(ad + i));
-        I256         u = clmul128(XOR128(sth->acc, m), st->hx[n - 1 - 0]);
+        I256         u = gh_update0(sth, ad + i, st->hx[n - 1 - 0]);
         size_t       j;
 
         for (j = 1; j < n; j += 1) {
-            m = REV128(LOAD128(ad + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[n - 1 - j]));
+            gh_update(&u, ad + i + j * 16, st->hx[n - 1 - j]);
         }
         i += n * 16;
         sth->acc = gcm_reduce(u);
@@ -419,7 +415,6 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     const BlockVec one = ONE128;
     BlockVec       final_block;
     BlockVec       rev_counters[PARALLEL_BLOCKS];
-    BlockVec       m;
     BlockVec       counter;
     size_t         i;
     size_t         j;
@@ -459,11 +454,9 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
         encrypt_xor_wide(st, dst + i, src + i, rev_counters);
 
         pi = i - PARALLEL_BLOCKS * 16;
-        m  = REV128(LOAD128(dst + pi));
-        u  = clmul128(XOR128(sth->acc, m), st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
+        u  = gh_update0(sth, dst + pi, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
         for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(dst + pi + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[2 * PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, dst + pi + j * 16, st->hx[2 * PARALLEL_BLOCKS - 1 - j]);
         }
 
         counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
@@ -472,18 +465,15 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
 
         pi = i;
         for (j = 0; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(dst + pi + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
 
     pi = i - PARALLEL_BLOCKS * 16;
-    m  = REV128(LOAD128(dst + pi));
-    u  = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]);
+    u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
     for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-        m = REV128(LOAD128(dst + pi + j * 16));
-        u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+        gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
     }
     sth->acc = gcm_reduce(u);
 }
@@ -500,21 +490,17 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
         encrypt_xor_wide(st, dst + i, src + i, rev_counters);
 
         pi = i - PARALLEL_BLOCKS * 16;
-        m  = REV128(LOAD128(dst + pi));
-        u  = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]);
+        u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
         for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(dst + pi + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
 
     pi = i - PARALLEL_BLOCKS * 16;
-    m  = REV128(LOAD128(dst + pi));
-    u  = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]);
+    u  = gh_update0(sth, dst + pi, st->hx[PARALLEL_BLOCKS - 1 - 0]);
     for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-        m = REV128(LOAD128(dst + pi + j * 16));
-        u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+        gh_update(&u, dst + pi + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
     }
     sth->acc = gcm_reduce(u);
 }
@@ -527,11 +513,9 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
             encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
         }
 
-        m = REV128(LOAD128(dst + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]);
+        u = gh_update0(sth, dst + i, st->hx[4 - 1 - 0]);
         for (j = 1; j < 4; j += 1) {
-            m = REV128(LOAD128(dst + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[4 - 1 - j]));
+            gh_update(&u, dst + i + j * 16, st->hx[4 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
@@ -544,11 +528,9 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
             encrypt_xor_block(st, dst + i + j * 16, src + i + j * 16, rev_counters[j]);
         }
 
-        m = REV128(LOAD128(dst + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]);
+        u = gh_update0(sth, dst + i, st->hx[2 - 1 - 0]);
         for (j = 1; j < 2; j += 1) {
-            m = REV128(LOAD128(dst + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[2 - 1 - j]));
+            gh_update(&u, dst + i + j * 16, st->hx[2 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
     }
@@ -558,8 +540,7 @@ aes_gcm_encrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
 
     for (; i + 16 < src_len; i += 16) {
         encrypt_xor_block(st, dst + i, src + i, REV128(counter));
-        m = REV128(LOAD128(dst + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[1 - 1 - 0]);
+        u = gh_update0(sth, dst + i, st->hx[1 - 1 - 0]);
         sth->acc = gcm_reduce(u);
         counter = ADD64x2(counter, one);
     }
@@ -603,7 +584,6 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     const BlockVec one = ONE128;
     BlockVec       final_block;
     BlockVec       rev_counters[PARALLEL_BLOCKS];
-    BlockVec       m;
     BlockVec       counter;
     size_t         i;
     size_t         j;
@@ -635,20 +615,17 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     for (; i + 2 * PARALLEL_BLOCKS * 16 <= src_len; i += 2 * PARALLEL_BLOCKS * 16) {
         counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
 
-        m = REV128(LOAD128(src + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
+        u = gh_update0(sth, src + i, st->hx[2 * PARALLEL_BLOCKS - 1 - 0]);
         for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(src + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[2 * PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, src + i + j * 16, st->hx[2 * PARALLEL_BLOCKS - 1 - j]);
         }
 
         encrypt_xor_wide(st, dst + i, src + i, rev_counters);
 
         counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
 
         for (j = 0; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(src + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, src + i + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
 
@@ -661,11 +638,9 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     for (; i + PARALLEL_BLOCKS * 16 <= src_len; i += PARALLEL_BLOCKS * 16) {
         counter = incr_counters(rev_counters, counter, PARALLEL_BLOCKS);
 
-        m = REV128(LOAD128(src + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[PARALLEL_BLOCKS - 1 - 0]);
+        u = gh_update0(sth, src + i, st->hx[PARALLEL_BLOCKS - 1 - 0]);
         for (j = 1; j < PARALLEL_BLOCKS; j += 1) {
-            m = REV128(LOAD128(src + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[PARALLEL_BLOCKS - 1 - j]));
+            gh_update(&u, src + i + j * 16, st->hx[PARALLEL_BLOCKS - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
 
@@ -677,11 +652,9 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     for (; i + 4 * 16 <= src_len; i += 4 * 16) {
         counter = incr_counters(rev_counters, counter, 4);
 
-        m = REV128(LOAD128(src + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[4 - 1 - 0]);
+        u = gh_update0(sth, src + i, st->hx[4 - 1 - 0]);
         for (j = 1; j < 4; j += 1) {
-            m = REV128(LOAD128(src + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[4 - 1 - j]));
+            gh_update(&u, src + i + j * 16, st->hx[4 - 1 - j]);
         }
         sth->acc = gcm_reduce(u);
 
@@ -695,11 +668,9 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
     for (; i + 2 * 16 <= src_len; i += 2 * 16) {
         counter = incr_counters(rev_counters, counter, 2);
 
-        m = REV128(LOAD128(src + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[2 - 1 - 0]);
+        u = gh_update0(sth, src + i, st->hx[2 - 1 - 0]);
         for (j = 1; j < 2; j += 1) {
-            m = REV128(LOAD128(src + i + j * 16));
-            u = XOR256(u, clmul128(m, st->hx[2 - 1 - j]));
+            gh_update(&u, src + i + j * 16, st->hx[2 - 1 - j]);
        }
         sth->acc = gcm_reduce(u);
 
@@ -712,8 +683,7 @@ aes_gcm_decrypt_generic(const State *st, GHash *sth, unsigned char mac[ABYTES],
        full block authenticated along with the final block, hence < and not <= */
 
     for (; i + 16 < src_len; i += 16) {
-        m = REV128(LOAD128(src + i));
-        u = clmul128(XOR128(sth->acc, m), st->hx[1 - 1 - 0]);
+        u = gh_update0(sth, src + i, st->hx[1 - 1 - 0]);
         sth->acc = gcm_reduce(u);
         encrypt_xor_block(st, dst + i, src + i, REV128(counter));
         counter = ADD64x2(counter, one);
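
The following toy program (an illustration written for this page, not libsodium code) checks the aggregation identity from the background note for n = 4. Here gf128_mul is the textbook bit-by-bit GF(2^128) multiplication from NIST SP 800-38D with the modular reduction folded in, standing in for clmul128 plus gcm_reduce; Blk, xor128, and all constants are invented for the sketch.

    /* Illustrative only: aggregated GHASH update vs. sequential Horner steps. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t hi, lo; } Blk; /* GF(2^128) element, MSB-first bits */

    static Blk xor128(const Blk a, const Blk b)
    {
        const Blk r = { a.hi ^ b.hi, a.lo ^ b.lo };
        return r;
    }

    /* Shift-and-reduce multiplication from NIST SP 800-38D; models
     * clmul128 followed by gcm_reduce. */
    static Blk gf128_mul(const Blk x, const Blk y)
    {
        Blk z = { 0, 0 };
        Blk v = y;
        int i;

        for (i = 0; i < 128; i++) {
            const uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1
                                         : (x.lo >> (127 - i)) & 1;
            uint64_t lsb;

            if (xi) {
                z = xor128(z, v); /* accumulate v for each set bit of x */
            }
            lsb  = v.lo & 1;      /* shift v right by one, then reduce */
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb) {
                v.hi ^= 0xe100000000000000ULL; /* GCM reduction polynomial R */
            }
        }
        return z;
    }

    int main(void)
    {
        const Blk h    = { 0x66e94bd4ef8a2c3bULL, 0x884cfa59ca342b2eULL }; /* arbitrary */
        const Blk m[4] = { { 1, 2 }, { 3, 4 }, { 5, 6 }, { 7, 8 } };       /* arbitrary */
        Blk seq = { 0x1234, 0x5678 }; /* starting accumulator, arbitrary */
        Blk agg, hx[4];
        int j;

        /* Precompute hx[j] = H^(j+1), mirroring st->hx[]. */
        hx[0] = h;
        for (j = 1; j < 4; j++) {
            hx[j] = gf128_mul(hx[j - 1], h);
        }

        /* Aggregated form: the first block folds in the accumulator
         * (gh_update0); later blocks are multiplied and XORed in (gh_update). */
        agg = gf128_mul(xor128(seq, m[0]), hx[4 - 1 - 0]);
        for (j = 1; j < 4; j++) {
            agg = xor128(agg, gf128_mul(m[j], hx[4 - 1 - j]));
        }

        /* Sequential Horner recurrence: acc = (acc ^ m_j) * H, four times. */
        for (j = 0; j < 4; j++) {
            seq = gf128_mul(xor128(seq, m[j]), h);
        }

        assert(seq.hi == agg.hi && seq.lo == agg.lo);
        printf("aggregated == sequential: %016llx%016llx\n",
               (unsigned long long) agg.hi, (unsigned long long) agg.lo);
        return 0;
    }

Because gf128_mul already reduces, the XORs in this model happen on reduced values; the real code XORs unreduced 256-bit products and reduces once in gcm_reduce, which is equivalent since the reduction is GF(2)-linear. Deferring the reduction is what makes batching profitable, and hoisting the repeated load, byte-reverse, and multiply steps into gh_update0/gh_update lets every loop share one definition in place of the removed XOR256 helper.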