forked from aklomp/base64
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use Wojciech Mula's (@WojciechMula) updated implementation for AVX2 / SSSE3 encoding. The SSSE3 implementation is reused in the SSE4.1, SSE4.2 and AVX dispatched encoding loops. The SSE4.1 implementation is now useless, but is kept to ease integration of future updates if needed. Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64): SSSE3 encoding: +20%; SSE4.2 encoding: +8%; AVX encoding: +7%; AVX2 encoding: +3%.
- Loading branch information
Showing
10 changed files
with
168 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,28 @@ | ||
// If we have AVX2 support, pick off 24 bytes at a time for as long as we can. | ||
// But because we read 32 bytes at a time, ensure we have enough room to do a | ||
// full 32-byte read without segfaulting: | ||
while (srclen >= 32) | ||
{ | ||
// Load string: | ||
__m256i str = _mm256_loadu_si256((__m256i *)c); | ||
|
||
// Reshuffle: | ||
str = enc_reshuffle(str); | ||
if (srclen >= 32) { | ||
const uint8_t* const o_orig = o; | ||
|
||
// Translate reshuffled bytes to the Base64 alphabet: | ||
str = enc_translate(str); | ||
// The first load is done at c - 0 (not c - 4) so as not to get a segfault | ||
__m256i inputvector = _mm256_loadu_si256((__m256i *)(c - 0)); | ||
|
||
// Store: | ||
_mm256_storeu_si256((__m256i *)o, str); | ||
// shift by 4 bytes, as required by enc_reshuffle | ||
inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6)); | ||
|
||
c += 24; // 6 * 4 bytes of input | ||
o += 32; // 8 * 4 bytes of output | ||
outl += 32; | ||
srclen -= 24; | ||
for (;;) { | ||
inputvector = enc_reshuffle(inputvector); | ||
inputvector = enc_translate(inputvector); | ||
_mm256_storeu_si256((__m256i *)o, inputvector); | ||
c += 24; | ||
o += 32; | ||
srclen -= 24; | ||
if(srclen < 28) { | ||
break; | ||
} | ||
// Load at c-4, as required by enc_reshuffle | ||
inputvector = _mm256_loadu_si256((__m256i *)(c - 4)); | ||
} | ||
outl += (size_t)(o - o_orig); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Rearrange 12 input bytes into sixteen 6-bit values, one per output byte. | ||
// | ||
// Only bytes 0..11 of `in` are referenced by the shuffle below; bytes 12..15 | ||
// are ignored. Each group of three input bytes (a, b, c) becomes one 32-bit | ||
// lane of the result holding four 6-bit fields, least significant byte first, | ||
// with the top two bits of every byte cleared. | ||
static inline __m128i | ||
enc_reshuffle (__m128i in) | ||
{ | ||
// input, bytes MSB to LSB: | ||
// 0 0 0 0 l k j i h g f e d c b a | ||
|
||
// Spread each 3-byte group over its own 32-bit lane. The middle byte of | ||
// the group appears twice because both masks below take bits from it. | ||
in = _mm_shuffle_epi8(in, _mm_set_epi8( | ||
10, 11, 9, 10, | ||
7, 8, 6, 7, | ||
4, 5, 3, 4, | ||
1, 2, 0, 1)); | ||
// in, bytes MSB to LSB: | ||
// k l j k | ||
// h i g h | ||
// e f d e | ||
// b c a b | ||
|
||
// Select the bit fields that have to move towards the low end of their | ||
// 16-bit half. | ||
const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); | ||
// bits, upper case are most significant bits, lower case are least significant bits | ||
// 0000kkkk LL000000 JJJJJJ00 00000000 | ||
// 0000hhhh II000000 GGGGGG00 00000000 | ||
// 0000eeee FF000000 DDDDDD00 00000000 | ||
// 0000bbbb CC000000 AAAAAA00 00000000 | ||
|
||
// Multiply-high acts as a per-16-bit right shift: the low half is | ||
// multiplied by 0x0040 (shift right by 10), the high half by 0x0400 | ||
// (shift right by 6). | ||
const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); | ||
// 00000000 00kkkkLL 00000000 00JJJJJJ | ||
// 00000000 00hhhhII 00000000 00GGGGGG | ||
// 00000000 00eeeeFF 00000000 00DDDDDD | ||
// 00000000 00bbbbCC 00000000 00AAAAAA | ||
|
||
// Select the bit fields that have to move towards the high end of their | ||
// 16-bit half. | ||
const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); | ||
// 00000000 00llllll 000000jj KKKK0000 | ||
// 00000000 00iiiiii 000000gg HHHH0000 | ||
// 00000000 00ffffff 000000dd EEEE0000 | ||
// 00000000 00cccccc 000000aa BBBB0000 | ||
|
||
// Multiply-low acts as a per-16-bit left shift: the low half is | ||
// multiplied by 0x0010 (shift left by 4), the high half by 0x0100 | ||
// (shift left by 8). | ||
const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); | ||
// 00llllll 00000000 00jjKKKK 00000000 | ||
// 00iiiiii 00000000 00ggHHHH 00000000 | ||
// 00ffffff 00000000 00ddEEEE 00000000 | ||
// 00cccccc 00000000 00aaBBBB 00000000 | ||
|
||
// t1 fills bytes 0 and 2 of each lane, t3 fills bytes 1 and 3, so OR | ||
// merges them without overlap. | ||
return _mm_or_si128(t1, t3); | ||
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ | ||
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG | ||
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD | ||
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA | ||
} |