Skip to content

Commit

Permalink
SSSE3->AVX2 encoding optimization
Browse files Browse the repository at this point in the history
Use Wojciech Mula (@WojciechMula) implementation update for AVX2 /
SSSE3 encoding.

SSSE3 implementation is reused in SSE4.1, SSE4.2 and AVX dispatched
encoding loops.

SSE4.1 implementation is now useless but kept to ease integration of
future updates if needed.

Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64)
SSSE3 encoding: +20%
SSE4.2 encoding: +8%
AVX encoding: +7%
AVX2 encoding: +3%
  • Loading branch information
mayeut committed Feb 21, 2017
1 parent cfa8bf7 commit 67ee3fd
Show file tree
Hide file tree
Showing 10 changed files with 168 additions and 132 deletions.
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Copyright (c) 2005-2007, Nick Galbreath
Copyright (c) 2013-2017, Alfred Klomp
Copyright (c) 2015-2017, Wojciech Mula
Copyright (c) 2016-2017, Matthieu Darbois
All rights reserved.

Expand Down
32 changes: 16 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -423,22 +423,22 @@ The tables below contain some results on random machines. All numbers measured w

x86 processors

| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.1 enc | SSE4.1 dec| SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - | - | - |
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - | - | - |
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 3886 | 6701 | 5098 | 7015 | 5281 | 8328 | 7063 |
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - |
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - |
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | - | - | - | - |

ARM processors

Expand Down
3 changes: 1 addition & 2 deletions lib/arch/avx/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,9 @@

#include "../sse2/compare_macros.h"

#include "../ssse3/_mm_bswap_epi32.c"
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/enc_translate.c"
#include "../sse41/enc_reshuffle.c"
#include "../ssse3/enc_reshuffle.c"

#endif // __AVX__

Expand Down
130 changes: 79 additions & 51 deletions lib/arch/avx2/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,57 +13,85 @@
#define REPLACE(s,n) _mm256_and_si256((s), _mm256_set1_epi8(n))
#define RANGE(s,a,b) _mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1))

static inline __m256i
_mm256_bswap_epi32 (const __m256i in)
{
// _mm256_shuffle_epi8() works on two 128-bit lanes separately:
return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
3, 2, 1, 0,
7, 6, 5, 4,
11, 10, 9, 8,
15, 14, 13, 12,
3, 2, 1, 0,
7, 6, 5, 4,
11, 10, 9, 8,
15, 14, 13, 12));
}

static inline __m256i
enc_reshuffle (__m256i in)
{
// Spread out 32-bit words over both halves of the input register:
in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
0, 1, 2, -1,
3, 4, 5, -1));

// Slice into 32-bit chunks and operate on all chunks in parallel.
// All processing is done within the 32-bit chunk. First, shuffle:
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2,
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2));

// merged = [0000aaaa|aabbbbbb|bbbbcccc|ccdddddd]
const __m256i merged = _mm256_blend_epi16(_mm256_slli_epi32(in, 4), in, 0x55);

// bd = [00000000|00bbbbbb|00000000|00dddddd]
const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

// ac = [00aaaaaa|00000000|00cccccc|00000000]
const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
const __m256i indices = _mm256_or_si256(ac, bd);

// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
return _mm256_bswap_epi32(indices);
static inline __m256i enc_reshuffle(const __m256i input) {
// translation from SSSE3 into AVX2 of procedure
// This one works with shifted (4 bytes) input in order to
// be able to work efficiently in the 2 128-bit lanes

// input, bytes MSB to LSB:
// 0 0 0 0 x w v u t s r q p o n m
// l k j i h g f e d c b a 0 0 0 0

const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
10, 11, 9, 10,
7, 8, 6, 7,
4, 5, 3, 4,
1, 2, 0, 1,

14, 15, 13, 14,
11, 12, 10, 11,
8, 9, 7, 8,
5, 6, 4, 5));
// in, bytes MSB to LSB:
// w x v w
// t u s t
// q r p q
// n o m n
// k l j k
// h i g h
// e f d e
// b c a b

const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
// bits, upper case are most significant bits, lower case are least significant bits.
// 0000wwww XX000000 VVVVVV00 00000000
// 0000tttt UU000000 SSSSSS00 00000000
// 0000qqqq RR000000 PPPPPP00 00000000
// 0000nnnn OO000000 MMMMMM00 00000000
// 0000kkkk LL000000 JJJJJJ00 00000000
// 0000hhhh II000000 GGGGGG00 00000000
// 0000eeee FF000000 DDDDDD00 00000000
// 0000bbbb CC000000 AAAAAA00 00000000

const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
// 00000000 00wwwwXX 00000000 00VVVVVV
// 00000000 00ttttUU 00000000 00SSSSSS
// 00000000 00qqqqRR 00000000 00PPPPPP
// 00000000 00nnnnOO 00000000 00MMMMMM
// 00000000 00kkkkLL 00000000 00JJJJJJ
// 00000000 00hhhhII 00000000 00GGGGGG
// 00000000 00eeeeFF 00000000 00DDDDDD
// 00000000 00bbbbCC 00000000 00AAAAAA

const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
// 00000000 00xxxxxx 000000vv WWWW0000
// 00000000 00uuuuuu 000000ss TTTT0000
// 00000000 00rrrrrr 000000pp QQQQ0000
// 00000000 00oooooo 000000mm NNNN0000
// 00000000 00llllll 000000jj KKKK0000
// 00000000 00iiiiii 000000gg HHHH0000
// 00000000 00ffffff 000000dd EEEE0000
// 00000000 00cccccc 000000aa BBBB0000

const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
// 00xxxxxx 00000000 00vvWWWW 00000000
// 00uuuuuu 00000000 00ssTTTT 00000000
// 00rrrrrr 00000000 00ppQQQQ 00000000
// 00oooooo 00000000 00mmNNNN 00000000
// 00llllll 00000000 00jjKKKK 00000000
// 00iiiiii 00000000 00ggHHHH 00000000
// 00ffffff 00000000 00ddEEEE 00000000
// 00cccccc 00000000 00aaBBBB 00000000

return _mm256_or_si256(t1, t3);
// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}

static inline __m256i
Expand Down
34 changes: 20 additions & 14 deletions lib/arch/avx2/enc_loop.c
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
// If we have AVX2 support, pick off 24 bytes at a time for as long as we can.
// But because we read 32 bytes at a time, ensure we have enough room to do a
// full 32-byte read without segfaulting:
while (srclen >= 32)
{
// Load string:
__m256i str = _mm256_loadu_si256((__m256i *)c);

// Reshuffle:
str = enc_reshuffle(str);
if (srclen >= 32) {
const uint8_t* const o_orig = o;

// Translate reshuffled bytes to the Base64 alphabet:
str = enc_translate(str);
// first load is done at c-0 not to get a segfault
__m256i inputvector = _mm256_loadu_si256((__m256i *)(c - 0));

// Store:
_mm256_storeu_si256((__m256i *)o, str);
// shift by 4 bytes, as required by enc_reshuffle
inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

c += 24; // 6 * 4 bytes of input
o += 32; // 8 * 4 bytes of output
outl += 32;
srclen -= 24;
for (;;) {
inputvector = enc_reshuffle(inputvector);
inputvector = enc_translate(inputvector);
_mm256_storeu_si256((__m256i *)o, inputvector);
c += 24;
o += 32;
srclen -= 24;
if(srclen < 28) {
break;
}
// Load at c-4, as required by enc_reshuffle
inputvector = _mm256_loadu_si256((__m256i *)(c - 4));
}
outl += (size_t)(o - o_orig);
}
3 changes: 1 addition & 2 deletions lib/arch/sse41/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,9 @@

#include "../sse2/compare_macros.h"

#include "../ssse3/_mm_bswap_epi32.c"
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/enc_translate.c"
#include "enc_reshuffle.c"
#include "../ssse3/enc_reshuffle.c"

#endif // __SSE4_1__

Expand Down
3 changes: 1 addition & 2 deletions lib/arch/sse42/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,9 @@

#include "../sse2/compare_macros.h"

#include "../ssse3/_mm_bswap_epi32.c"
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/enc_translate.c"
#include "../sse41/enc_reshuffle.c"
#include "../ssse3/enc_reshuffle.c"

#endif // __SSE4_2__

Expand Down
9 changes: 0 additions & 9 deletions lib/arch/ssse3/_mm_bswap_epi32.c

This file was deleted.

37 changes: 1 addition & 36 deletions lib/arch/ssse3/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,10 @@

#include "../sse2/compare_macros.h"

#include "_mm_bswap_epi32.c"
#include "dec_reshuffle.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"

static inline __m128i
enc_reshuffle (__m128i in)
{
// Slice into 32-bit chunks and operate on all chunks in parallel.
// All processing is done within the 32-bit chunk. First, shuffle:
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
in = _mm_shuffle_epi8(in, _mm_set_epi8(
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2));

// cd = [00000000|00000000|0000cccc|ccdddddd]
const __m128i cd = _mm_and_si128(in, _mm_set1_epi32(0x00000FFF));

// ab = [0000aaaa|aabbbbbb|00000000|00000000]
const __m128i ab = _mm_and_si128(_mm_slli_epi32(in, 4), _mm_set1_epi32(0x0FFF0000));

// merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
const __m128i merged = _mm_or_si128(ab, cd);

// bd = [00000000|00bbbbbb|00000000|00dddddd]
const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F));

// ac = [00aaaaaa|00000000|00cccccc|00000000]
const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00));

// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
const __m128i indices = _mm_or_si128(ac, bd);

// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
return _mm_bswap_epi32(indices);
}

#endif // __SSSE3__

BASE64_ENC_FUNCTION(ssse3)
Expand Down
48 changes: 48 additions & 0 deletions lib/arch/ssse3/enc_reshuffle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
static inline __m128i
enc_reshuffle (__m128i in)
{
// input, bytes MSB to LSB:
// 0 0 0 0 l k j i h g f e d c b a

in = _mm_shuffle_epi8(in, _mm_set_epi8(
10, 11, 9, 10,
7, 8, 6, 7,
4, 5, 3, 4,
1, 2, 0, 1));
// in, bytes MSB to LSB:
// k l j k
// h i g h
// e f d e
// b c a b

const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
// bits, upper case are most significant bits, lower case are least significant bits
// 0000kkkk LL000000 JJJJJJ00 00000000
// 0000hhhh II000000 GGGGGG00 00000000
// 0000eeee FF000000 DDDDDD00 00000000
// 0000bbbb CC000000 AAAAAA00 00000000

const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
// 00000000 00kkkkLL 00000000 00JJJJJJ
// 00000000 00hhhhII 00000000 00GGGGGG
// 00000000 00eeeeFF 00000000 00DDDDDD
// 00000000 00bbbbCC 00000000 00AAAAAA

const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
// 00000000 00llllll 000000jj KKKK0000
// 00000000 00iiiiii 000000gg HHHH0000
// 00000000 00ffffff 000000dd EEEE0000
// 00000000 00cccccc 000000aa BBBB0000

const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
// 00llllll 00000000 00jjKKKK 00000000
// 00iiiiii 00000000 00ggHHHH 00000000
// 00ffffff 00000000 00ddEEEE 00000000
// 00cccccc 00000000 00aaBBBB 00000000

return _mm_or_si128(t1, t3);
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}

0 comments on commit 67ee3fd

Please sign in to comment.