-
Notifications
You must be signed in to change notification settings - Fork 165
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
By performing benchmarks on Intel Edison (a Silvermont Atom CPU) in x86_64 mode from v0.3.0 we find that SSSE3 had various ups and downs. Substantial changes since v0.3.0 were: HASH SSSE3 SSSE3 e12e3cd 165 210 3f3f31c 206 150 67ee3fd 205 205 0a69845 145 205 a5b6739 145 218 6310c1f 157 218 9a0d1b2 158 210 5874921 165 210 Best performance was from 67ee3fd until decode performance regressed from 205 to 145 MB/s with commit 0a69845. The commit before that (b6417f3) had best decode performance with relatively good encode. Core(-i7) processors do not show such large performance changes. This patch adds the ssse3 codec from b6417f3 as ssse3_atom. Signed-off-by: Ferry Toth <[email protected]>
- Loading branch information
Showing
18 changed files
with
466 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// Byte-wise comparison helpers on 128-bit vectors. Each scalar argument is | ||
// broadcast to all 16 byte lanes with _mm_set1_epi8 before use. | ||
// | ||
// CMPGT(s,n): per-byte mask of lanes where s > n (via _mm_cmpgt_epi8). | ||
#define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n)) | ||
// CMPEQ(s,n): per-byte mask of lanes where s == n. | ||
#define CMPEQ(s,n) _mm_cmpeq_epi8((s), _mm_set1_epi8(n)) | ||
// REPLACE(s,n): bitwise AND of s with the broadcast n; when s is a comparison | ||
// mask this leaves n in the selected lanes and 0 in the others. | ||
#define REPLACE(s,n) _mm_and_si128((s), _mm_set1_epi8(n)) | ||
// RANGE(s,a,b): mask of lanes with a <= s <= b, built as (s > a - 1) AND NOT (s > b). | ||
#define RANGE(s,a,b) _mm_andnot_si128(CMPGT((s), (b)), CMPGT((s), (a) - 1)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#include <stdint.h> | ||
#include <stddef.h> | ||
#include <stdlib.h> | ||
|
||
#include "../../../include/libbase64.h" | ||
#include "../../tables/tables.h" | ||
#include "../../codecs.h" | ||
#include "config.h" | ||
#include "../../env.h" | ||
|
||
#if HAVE_SSSE3 | ||
#include <tmmintrin.h> | ||
|
||
#include "../sse2/compare_macros.h" | ||
|
||
#include "dec_reshuffle.c" | ||
#include "enc_reshuffle.c" | ||
#include "enc_translate.c" | ||
|
||
#endif // HAVE_SSSE3 | ||
|
||
// Encoder entry point for the ssse3_atom codec. | ||
// When HAVE_SSSE3 is set, the function body is assembled by textually including | ||
// three fragments in order (head, vector loop, tail); otherwise the body is just | ||
// BASE64_ENC_STUB. The signature comes from the BASE64_ENC_FUNCTION macro, which | ||
// is defined outside this file. | ||
BASE64_ENC_FUNCTION(ssse3_atom) | ||
{ | ||
#if HAVE_SSSE3 | ||
#include "enc_head.c" | ||
#include "enc_loop.c" | ||
#include "enc_tail.c" | ||
#else | ||
BASE64_ENC_STUB | ||
#endif | ||
} | ||
|
||
// Decoder entry point for the ssse3_atom codec. | ||
// When HAVE_SSSE3 is set, the function body is assembled by textually including | ||
// three fragments in order (head, vector loop, tail); otherwise the body is just | ||
// BASE64_DEC_STUB. The signature comes from the BASE64_DEC_FUNCTION macro, which | ||
// is defined outside this file. | ||
BASE64_DEC_FUNCTION(ssse3_atom) | ||
{ | ||
#if HAVE_SSSE3 | ||
#include "dec_head.c" | ||
#include "dec_loop.c" | ||
#include "dec_tail.c" | ||
#else | ||
BASE64_DEC_STUB | ||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Decoder prologue. This is a fragment meant to be textually included inside a | ||
// decoder function body (presumably as dec_head.c); it relies on src, srclen, out, | ||
// outlen and state being in scope. It sets up the input/output cursors and a local | ||
// copy of the stream state, short-circuits when the stream already reached EOF, | ||
// and then opens a switch/for construct whose "case 0:" label is deliberately left | ||
// open for the code that follows this fragment. | ||
int ret = 0; | ||
const uint8_t *c = (const uint8_t *)src; | ||
uint8_t *o = (uint8_t *)out; | ||
uint8_t q; | ||
|
||
// Use local temporaries to avoid cache thrashing: | ||
size_t outl = 0; | ||
struct base64_state st; | ||
st.eof = state->eof; | ||
st.bytes = state->bytes; | ||
st.carry = state->carry; | ||
|
||
// If we previously saw an EOF or an invalid character, bail out: | ||
if (st.eof) { | ||
*outlen = 0; | ||
ret = 0; | ||
// If there was a trailing '=' to check, check it: | ||
// (BASE64_AEOF state: succeed only if the input is exactly one byte whose | ||
// table value is 254; either way the stream is moved to BASE64_EOF.) | ||
if (srclen && (st.eof == BASE64_AEOF)) { | ||
state->bytes = 0; | ||
state->eof = BASE64_EOF; | ||
ret = ((base64_table_dec_8bit[*c++] == 254) && (srclen == 1)) ? 1 : 0; | ||
} | ||
return ret; | ||
} | ||
|
||
// Turn four 6-bit numbers into three bytes: | ||
// out[0] = 11111122 | ||
// out[1] = 22223333 | ||
// out[2] = 33444444 | ||
|
||
// Duff's device again: | ||
// st.bytes selects the case label at which decoding resumes, so a group that was | ||
// split across calls continues where it stopped. | ||
switch (st.bytes) | ||
{ | ||
for (;;) | ||
{ | ||
case 0: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
// Vectorised decoder loop (fragment; uses c, o, outl and srclen from the enclosing | ||
// scope). Each iteration consumes 16 input characters and produces 12 output bytes. | ||
// If we have SSSE3 support, pick off 16 bytes at a time for as long as we can, | ||
// but make sure that we quit before seeing any == markers at the end of the | ||
// string. Also, because we write four zeroes at the end of the output, ensure | ||
// that there are at least 6 valid bytes of input data remaining to close the | ||
// gap. 16 + 2 + 6 = 24 bytes: | ||
while (srclen >= 24) | ||
{ | ||
// Load string: | ||
__m128i str = _mm_loadu_si128((__m128i *)c); | ||
|
||
// The input consists of six character sets in the Base64 alphabet, | ||
// which we need to map back to the 6-bit values they represent. | ||
// There are three ranges, two singles, and then there's the rest. | ||
// | ||
// # From To Add Characters | ||
// 1 [43] [62] +19 + | ||
// 2 [47] [63] +16 / | ||
// 3 [48..57] [52..61] +4 0..9 | ||
// 4 [65..90] [0..25] -65 A..Z | ||
// 5 [97..122] [26..51] -71 a..z | ||
// (6) Everything else => invalid input | ||
|
||
// One mask per character set; a lane is set where the byte belongs to that set: | ||
const __m128i set1 = CMPEQ(str, '+'); | ||
const __m128i set2 = CMPEQ(str, '/'); | ||
const __m128i set3 = RANGE(str, '0', '9'); | ||
const __m128i set4 = RANGE(str, 'A', 'Z'); | ||
const __m128i set5 = RANGE(str, 'a', 'z'); | ||
|
||
// Combine the per-set offsets from the table above into one delta per lane. | ||
// Lanes that matched none of the sets keep a delta of zero: | ||
__m128i delta = REPLACE(set1, 19); | ||
delta = _mm_or_si128(delta, REPLACE(set2, 16)); | ||
delta = _mm_or_si128(delta, REPLACE(set3, 4)); | ||
delta = _mm_or_si128(delta, REPLACE(set4, -65)); | ||
delta = _mm_or_si128(delta, REPLACE(set5, -71)); | ||
|
||
// Check for invalid input: if any of the delta values are zero, | ||
// fall back on bytewise code to do error checking and reporting: | ||
// (the break happens before c/srclen are advanced, so this chunk is left | ||
// unconsumed for the code after the loop) | ||
if (_mm_movemask_epi8(CMPEQ(delta, 0))) { | ||
break; | ||
} | ||
|
||
// Now simply add the delta values to the input: | ||
str = _mm_add_epi8(str, delta); | ||
|
||
// Reshuffle the input to packed 12-byte output format: | ||
str = dec_reshuffle(str); | ||
|
||
// Store back: | ||
// (a full 16-byte store, although the output cursor only advances by 12) | ||
_mm_storeu_si128((__m128i *)o, str); | ||
|
||
c += 16; | ||
o += 12; | ||
outl += 12; | ||
srclen -= 16; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// Packs sixteen decoded 6-bit values (one per byte of `in`) into 12 contiguous | ||
// output bytes. Within every 32-bit lane the four 6-bit fields are shifted so that | ||
// they sit back to back in the upper 24 bits; a byte shuffle then gathers those | ||
// three bytes per lane, most significant first, into the low 12 bytes of the | ||
// result. The last four shuffle indices are -1, which zeroes the top four bytes. | ||
static inline __m128i | ||
dec_reshuffle (__m128i in) | ||
{ | ||
// Mask in a single byte per shift: | ||
const __m128i maskB2 = _mm_set1_epi32(0x003F0000); | ||
const __m128i maskB1 = _mm_set1_epi32(0x00003F00); | ||
|
||
// Pack bytes together: | ||
// (byte 3 of each lane moves down to bits 8..13) | ||
__m128i out = _mm_srli_epi32(in, 16); | ||
|
||
// (byte 2 of each lane moves to bits 14..19) | ||
out = _mm_or_si128(out, _mm_srli_epi32(_mm_and_si128(in, maskB2), 2)); | ||
|
||
// (byte 1 of each lane moves to bits 20..25) | ||
out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, maskB1), 12)); | ||
|
||
// (byte 0 of each lane moves to bits 26..31) | ||
out = _mm_or_si128(out, _mm_slli_epi32(in, 26)); | ||
|
||
// Reshuffle and repack into 12-byte output format: | ||
// (byte 0 of each lane is never selected, so leftovers there are discarded) | ||
return _mm_shuffle_epi8(out, _mm_setr_epi8( | ||
3, 2, 1, | ||
7, 6, 5, | ||
11, 10, 9, | ||
15, 14, 13, | ||
-1, -1, -1, -1)); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Decoder epilogue: bytewise decoding of the remaining input (fragment; it | ||
// continues inside an already opened switch/for construct, directly after a | ||
// "case 0:" label, and closes both at the end). st.bytes tracks the position | ||
// inside the current 4-character group and st.carry holds the bits of the output | ||
// byte that is still being assembled, so decoding can resume across calls. | ||
// Table values >= 254 are special: 254 is '=' and 255 an invalid character (see | ||
// the comments in case 3). Running out of input sets ret = 1 and leaves the loop. | ||
if (srclen-- == 0) { | ||
ret = 1; | ||
break; | ||
} | ||
if ((q = base64_table_dec_8bit[*c++]) >= 254) { | ||
st.eof = BASE64_EOF; | ||
// Treat character '=' as invalid for byte 0: | ||
break; | ||
} | ||
st.carry = q << 2; | ||
st.bytes++; | ||
|
||
// Deliberate fallthrough: | ||
BASE64_FALLTHROUGH | ||
|
||
case 1: if (srclen-- == 0) { | ||
ret = 1; | ||
break; | ||
} | ||
if ((q = base64_table_dec_8bit[*c++]) >= 254) { | ||
st.eof = BASE64_EOF; | ||
// Treat character '=' as invalid for byte 1: | ||
break; | ||
} | ||
// First output byte: six carried bits plus the top two bits of q. | ||
*o++ = st.carry | (q >> 4); | ||
st.carry = q << 4; | ||
st.bytes++; | ||
outl++; | ||
|
||
// Deliberate fallthrough: | ||
BASE64_FALLTHROUGH | ||
|
||
case 2: if (srclen-- == 0) { | ||
ret = 1; | ||
break; | ||
} | ||
if ((q = base64_table_dec_8bit[*c++]) >= 254) { | ||
st.bytes++; | ||
// When q == 254, the input char is '='. | ||
// Check if next byte is also '=': | ||
if (q == 254) { | ||
if (srclen-- != 0) { | ||
st.bytes = 0; | ||
// EOF: | ||
st.eof = BASE64_EOF; | ||
// Success only if the next byte is '=' and it is the last input byte: | ||
q = base64_table_dec_8bit[*c++]; | ||
ret = ((q == 254) && (srclen == 0)) ? 1 : 0; | ||
break; | ||
} | ||
else { | ||
// Almost EOF | ||
// (input ended after the first '='; the second one is checked on | ||
// the next call via the BASE64_AEOF state) | ||
st.eof = BASE64_AEOF; | ||
ret = 1; | ||
break; | ||
} | ||
} | ||
// If we get here, there was an error: | ||
break; | ||
} | ||
// Second output byte: four carried bits plus the top four bits of q. | ||
*o++ = st.carry | (q >> 2); | ||
st.carry = q << 6; | ||
st.bytes++; | ||
outl++; | ||
|
||
// Deliberate fallthrough: | ||
BASE64_FALLTHROUGH | ||
|
||
case 3: if (srclen-- == 0) { | ||
ret = 1; | ||
break; | ||
} | ||
if ((q = base64_table_dec_8bit[*c++]) >= 254) { | ||
st.bytes = 0; | ||
st.eof = BASE64_EOF; | ||
// When q == 254, the input char is '='. Return 1 and EOF. | ||
// When q == 255, the input char is invalid. Return 0 and EOF. | ||
ret = ((q == 254) && (srclen == 0)) ? 1 : 0; | ||
break; | ||
} | ||
// Third output byte completes the group; the for loop then starts the next one. | ||
*o++ = st.carry | q; | ||
st.carry = 0; | ||
st.bytes = 0; | ||
outl++; | ||
} | ||
} | ||
// Write the local state copy back and report the number of bytes produced: | ||
state->eof = st.eof; | ||
state->bytes = st.bytes; | ||
state->carry = st.carry; | ||
*outlen = outl; | ||
return ret; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Encoder prologue. This is a fragment meant to be textually included inside an | ||
// encoder function body; it relies on src, out and state being in scope. It sets | ||
// up the input/output cursors and a local copy of the stream state, then opens a | ||
// switch/for construct whose "case 0:" label is deliberately left open for the | ||
// code that follows this fragment. | ||
// Assume that *out is large enough to contain the output. | ||
// Theoretically it should be 4/3 the length of src. | ||
const uint8_t *c = (const uint8_t *)src; | ||
uint8_t *o = (uint8_t *)out; | ||
|
||
// Use local temporaries to avoid cache thrashing: | ||
size_t outl = 0; | ||
struct base64_state st; | ||
st.bytes = state->bytes; | ||
st.carry = state->carry; | ||
|
||
// Turn three bytes into four 6-bit numbers: | ||
// in[0] = 00111111 | ||
// in[1] = 00112222 | ||
// in[2] = 00222233 | ||
// in[3] = 00333333 | ||
|
||
// Duff's device, a for() loop inside a switch() statement. Legal! | ||
// st.bytes selects the case label at which encoding resumes. | ||
switch (st.bytes) | ||
{ | ||
for (;;) | ||
{ | ||
case 0: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Vectorised encoder loop (fragment; uses c, o, outl and srclen from the enclosing | ||
// scope). Each iteration consumes 12 input bytes and produces 16 output characters. | ||
// If we have SSSE3 support, pick off 12 bytes at a time for as long as we can. | ||
// But because we read 16 bytes at a time, ensure we have enough room to do a | ||
// full 16-byte read without segfaulting: | ||
while (srclen >= 16) | ||
{ | ||
// Load string: | ||
__m128i str = _mm_loadu_si128((__m128i *)c); | ||
|
||
// Reshuffle: | ||
// (spreads the low 12 bytes into sixteen 6-bit values, one per byte) | ||
str = enc_reshuffle(str); | ||
|
||
// Translate reshuffled bytes to the Base64 alphabet: | ||
str = enc_translate(str); | ||
|
||
// Store: | ||
_mm_storeu_si128((__m128i *)o, str); | ||
|
||
c += 12; // 3 * 4 bytes of input | ||
o += 16; // 4 * 4 bytes of output | ||
outl += 16; | ||
srclen -= 12; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Splits 12 input bytes into sixteen 6-bit values, one per output byte. | ||
// Only bytes 0..11 of `in` are referenced by the initial shuffle; the upper four | ||
// bytes are ignored. Each 32-bit lane receives one 3-byte group (with its first | ||
// and middle byte duplicated), after which two mask-and-multiply steps move the | ||
// four 6-bit fields into the low six bits of their respective bytes. The bit | ||
// diagrams below trace every intermediate value. | ||
static inline __m128i | ||
enc_reshuffle (__m128i in) | ||
{ | ||
// input, bytes MSB to LSB: | ||
// 0 0 0 0 l k j i h g f e d c b a | ||
|
||
in = _mm_shuffle_epi8(in, _mm_set_epi8( | ||
10, 11, 9, 10, | ||
7, 8, 6, 7, | ||
4, 5, 3, 4, | ||
1, 2, 0, 1)); | ||
// in, bytes MSB to LSB: | ||
// k l j k | ||
// h i g h | ||
// e f d e | ||
// b c a b | ||
|
||
const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); | ||
// bits, upper case are most significant bits, lower case are least significant bits | ||
// 0000kkkk LL000000 JJJJJJ00 00000000 | ||
// 0000hhhh II000000 GGGGGG00 00000000 | ||
// 0000eeee FF000000 DDDDDD00 00000000 | ||
// 0000bbbb CC000000 AAAAAA00 00000000 | ||
|
||
// The high half of a 16-bit multiply acts as a per-element right shift | ||
// (by 10 in the low halves, by 6 in the high halves): | ||
const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); | ||
// 00000000 00kkkkLL 00000000 00JJJJJJ | ||
// 00000000 00hhhhII 00000000 00GGGGGG | ||
// 00000000 00eeeeFF 00000000 00DDDDDD | ||
// 00000000 00bbbbCC 00000000 00AAAAAA | ||
|
||
const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); | ||
// 00000000 00llllll 000000jj KKKK0000 | ||
// 00000000 00iiiiii 000000gg HHHH0000 | ||
// 00000000 00ffffff 000000dd EEEE0000 | ||
// 00000000 00cccccc 000000aa BBBB0000 | ||
|
||
// The low half of a 16-bit multiply acts as a per-element left shift | ||
// (by 4 in the low halves, by 8 in the high halves): | ||
const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); | ||
// 00llllll 00000000 00jjKKKK 00000000 | ||
// 00iiiiii 00000000 00ggHHHH 00000000 | ||
// 00ffffff 00000000 00ddEEEE 00000000 | ||
// 00cccccc 00000000 00aaBBBB 00000000 | ||
|
||
return _mm_or_si128(t1, t3); | ||
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ | ||
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG | ||
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD | ||
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA | ||
} |
Oops, something went wrong.