Skip to content

Commit

Permalink
codec: add ssse3_atom
Browse files Browse the repository at this point in the history
By performing benchmarks on Intel Edison (a Silvermont Atom CPU) in x86_64 mode
from v0.3.0 onward, we find that SSSE3 performance had various ups and downs. Substantial changes
since v0.3.0 were:
HASH	SSSE3 decode (MB/s)	SSSE3 encode (MB/s)
e12e3cd	165	210
3f3f31c	206	150
67ee3fd	205	205
0a69845	145	205
a5b6739	145	218
6310c1f	157	218
9a0d1b2	158	210
5874921	165	210
Best performance was from 67ee3fd until decode performance regressed
from 205 to 145 MB/s with commit 0a69845. The commit before that
(b6417f3) had best decode performance with relatively good encode.
Core(-i7) processors do not show such large performance changes.
This patch adds the ssse3 codec from b6417f3 as ssse3_atom.

Signed-off-by: Ferry Toth <[email protected]>
  • Loading branch information
htot committed Jun 22, 2022
1 parent 91dec51 commit 2765efd
Show file tree
Hide file tree
Showing 18 changed files with 466 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ cmake_dependent_option(BASE64_REGENERATE_TABLES "regenerate the codec tables" OF
set(_IS_X86 "_TARGET_ARCH STREQUAL \"x86\" OR _TARGET_ARCH STREQUAL \"x64\"")
cmake_dependent_option(BASE64_WITH_SSSE3 "add SSSE 3 codepath" ON ${_IS_X86} OFF)
add_feature_info(SSSE3 BASE64_WITH_SSSE3 "add SSSE 3 codepath")
cmake_dependent_option(BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath" ON ${_IS_X86} OFF)
# Register the Atom-tuned SSSE3 codec under its own feature name so that it
# does not collide with the plain SSSE3 entry in the feature summary.
add_feature_info(SSSE3_ATOM BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath")
cmake_dependent_option(BASE64_WITH_SSE41 "add SSE 4.1 codepath" ON ${_IS_X86} OFF)
add_feature_info(SSE4.1 BASE64_WITH_SSE41 "add SSE 4.1 codepath")
cmake_dependent_option(BASE64_WITH_SSE42 "add SSE 4.2 codepath" ON ${_IS_X86} OFF)
Expand Down Expand Up @@ -116,6 +118,7 @@ add_library(base64
lib/arch/generic/codec.c

lib/arch/ssse3/codec.c
lib/arch/ssse3_atom/codec.c
lib/arch/sse41/codec.c
lib/arch/sse42/codec.c
lib/arch/avx/codec.c
Expand Down Expand Up @@ -204,6 +207,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
endmacro()

configure_codec(SSSE3 __SSSE3__)
configure_codec(SSSE3_ATOM __SSSE3__)
configure_codec(SSE41 __SSSE4_1__)
configure_codec(SSE42 __SSSE4_2__)
configure_codec(AVX)
Expand Down
1 change: 1 addition & 0 deletions cmake/Modules/TargetSIMDInstructionSet.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ macro(define_SIMD_compile_flags)
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
# x86
set(COMPILE_FLAGS_SSSE3 "-mssse3")
set(COMPILE_FLAGS_SSSE3_ATOM "-mssse3")
set(COMPILE_FLAGS_SSE41 "-msse4.1")
set(COMPILE_FLAGS_SSE42 "-msse4.2")
set(COMPILE_FLAGS_AVX "-mavx")
Expand Down
1 change: 1 addition & 0 deletions include/libbase64.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ extern "C" {
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_SSSE3_ATOM (1 << 8)
#define BASE64_CHECK_SUPPORT (1 << 15)

struct base64_state {
Expand Down
4 changes: 4 additions & 0 deletions lib/arch/sse2/compare_macros.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Byte-wise SSE2 helper macros. `s` is a __m128i vector; `n`, `a` and `b` are
// byte values that are broadcast to all 16 lanes with _mm_set1_epi8().

// CMPGT: per-lane mask of the lanes where s > n (signed 8-bit comparison).
#define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n))
// CMPEQ: per-lane mask of the lanes where s == n.
#define CMPEQ(s,n) _mm_cmpeq_epi8((s), _mm_set1_epi8(n))
// REPLACE: bitwise AND of s with the broadcast value n. When s is a lane mask
// as produced by CMPGT/CMPEQ/RANGE, the result holds n in the selected lanes
// and zero everywhere else.
#define REPLACE(s,n) _mm_and_si128((s), _mm_set1_epi8(n))
// RANGE: per-lane mask of the lanes where a <= s <= b, computed as
// (s > a - 1) AND NOT (s > b).
#define RANGE(s,a,b) _mm_andnot_si128(CMPGT((s), (b)), CMPGT((s), (a) - 1))
42 changes: 42 additions & 0 deletions lib/arch/ssse3_atom/codec.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

// The SIMD helpers below are only pulled in when HAVE_SSSE3 is set:
#if HAVE_SSSE3
#include <tmmintrin.h>

#include "../sse2/compare_macros.h"

#include "dec_reshuffle.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"

#endif // HAVE_SSSE3

// Encoder entry point for the "ssse3_atom" codec.
//
// The function body is assembled from textually included fragments:
// enc_head.c sets up the local state and opens the switch/for construct,
// enc_loop.c contributes the SIMD loop, and enc_tail.c is included last
// (presumably closing the construct and writing the state back; that file is
// not visible here). Without HAVE_SSSE3 the body is BASE64_ENC_STUB instead.
BASE64_ENC_FUNCTION(ssse3_atom)
{
#if HAVE_SSSE3
#include "enc_head.c"
#include "enc_loop.c"
#include "enc_tail.c"
#else
BASE64_ENC_STUB
#endif
}

// Decoder entry point for the "ssse3_atom" codec.
//
// The function body is assembled from textually included fragments:
// dec_head.c sets up the local state, handles a previously seen EOF and opens
// the switch/for construct; dec_loop.c contributes the SIMD loop that decodes
// 16 characters per iteration; dec_tail.c decodes the remaining characters
// one at a time, closes the construct, stores the state back and returns.
// Without HAVE_SSSE3 the body is BASE64_DEC_STUB instead.
BASE64_DEC_FUNCTION(ssse3_atom)
{
#if HAVE_SSSE3
#include "dec_head.c"
#include "dec_loop.c"
#include "dec_tail.c"
#else
BASE64_DEC_STUB
#endif
}
36 changes: 36 additions & 0 deletions lib/arch/ssse3_atom/dec_head.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Decoder prologue. This fragment is textually included at the top of the
// decoder function body; it declares the locals shared with the fragments
// that follow and opens a switch/for construct that is closed elsewhere.

// Return value of the decoder; stays 0 unless a later step sets it to 1:
int ret = 0;
// Byte cursors over the input and output buffers:
const uint8_t *c = (const uint8_t *)src;
uint8_t *o = (uint8_t *)out;
// Scratch variable for the most recent lookup in the decoding table:
uint8_t q;

// Use local temporaries to avoid cache thrashing:
size_t outl = 0;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;

// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
// Nothing is decoded in this call:
*outlen = 0;
ret = 0;
// If there was a trailing '=' to check, check it:
if (srclen && (st.eof == BASE64_AEOF)) {
state->bytes = 0;
state->eof = BASE64_EOF;
// Succeed only if the input is exactly one character and the table maps it
// to 254 (the value used for '='):
ret = ((base64_table_dec_8bit[*c++] == 254) && (srclen == 1)) ? 1 : 0;
}
return ret;
}

// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444

// Duff's device again:
// The switch jumps straight to the case label that matches st.bytes, i.e. the
// position within the current group of four characters, so decoding resumes
// where the previous call stopped. The for(;;) loop wraps all case labels.
switch (st.bytes)
{
for (;;)
{
case 0:
54 changes: 54 additions & 0 deletions lib/arch/ssse3_atom/dec_loop.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// SSSE3 fast path: decode 16 input characters into 12 output bytes per
// iteration. The loop must stop before any '=' markers at the end of the
// string, and because each 16-byte store writes four zero bytes after the 12
// decoded ones, at least 6 more valid input bytes must remain to close that
// gap. Hence the threshold of 16 + 2 + 6 = 24 bytes:
while (srclen >= 24)
{
	// Fetch the next 16 characters:
	const __m128i chars = _mm_loadu_si128((__m128i *)c);

	// Every valid character belongs to exactly one of five classes, each
	// with its own offset from ASCII code to 6-bit value:
	//
	//   #  From       To        Add  Characters
	//   1  [43]       [62]      +19  +
	//   2  [47]       [63]      +16  /
	//   3  [48..57]   [52..61]   +4  0..9
	//   4  [65..90]   [0..25]   -65  A..Z
	//   5  [97..122]  [26..51]  -71  a..z
	//
	// Anything outside these classes is invalid input.
	const __m128i is_plus  = CMPEQ(chars, '+');
	const __m128i is_slash = CMPEQ(chars, '/');
	const __m128i is_digit = RANGE(chars, '0', '9');
	const __m128i is_upper = RANGE(chars, 'A', 'Z');
	const __m128i is_lower = RANGE(chars, 'a', 'z');

	// Merge the per-class offsets into one vector; a lane that matched no
	// class is left at zero:
	const __m128i delta = _mm_or_si128(
		_mm_or_si128(REPLACE(is_plus, 19), REPLACE(is_slash, 16)),
		_mm_or_si128(
			REPLACE(is_digit, 4),
			_mm_or_si128(REPLACE(is_upper, -65), REPLACE(is_lower, -71))));

	// A zero offset marks an invalid character. Leave the fast path and
	// let the bytewise code do the error checking and reporting:
	if (_mm_movemask_epi8(CMPEQ(delta, 0)) != 0) {
		break;
	}

	// Translate to 6-bit values, pack into the 12-byte output layout and
	// store all 16 bytes:
	const __m128i packed = dec_reshuffle(_mm_add_epi8(chars, delta));
	_mm_storeu_si128((__m128i *)o, packed);

	// 16 characters consumed, 12 bytes produced:
	srclen -= 16;
	c += 16;
	outl += 12;
	o += 12;
}
24 changes: 24 additions & 0 deletions lib/arch/ssse3_atom/dec_reshuffle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Pack sixteen 6-bit values (one per input byte) into twelve output bytes.
// The last four bytes of the result are zero.
static inline __m128i
dec_reshuffle (__m128i in)
{
	// Isolate the single bytes that are shifted on their own:
	const __m128i byte2 = _mm_and_si128(in, _mm_set1_epi32(0x003F0000));
	const __m128i byte1 = _mm_and_si128(in, _mm_set1_epi32(0x00003F00));

	// Move every 6-bit value to its place within each 32-bit lane. After
	// merging, the upper three bytes of each lane hold the 24 packed bits;
	// the lowest byte is leftover and is discarded by the shuffle below.
	const __m128i from_top    = _mm_srli_epi32(in, 16);
	const __m128i from_byte2  = _mm_srli_epi32(byte2, 2);
	const __m128i from_byte1  = _mm_slli_epi32(byte1, 12);
	const __m128i from_bottom = _mm_slli_epi32(in, 26);

	const __m128i merged = _mm_or_si128(
		_mm_or_si128(from_top, from_byte2),
		_mm_or_si128(from_byte1, from_bottom));

	// Take the upper three bytes of every lane, most significant first, and
	// pack them into 12 contiguous bytes. Index -1 yields a zero byte.
	const __m128i pick = _mm_setr_epi8(
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1);

	return _mm_shuffle_epi8(merged, pick);
}
90 changes: 90 additions & 0 deletions lib/arch/ssse3_atom/dec_tail.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Bytewise decoder. This fragment continues inside `case 0:` of the
// switch/for construct opened in dec_head.c. Each case handles one of the four
// characters of a group; every `break` below leaves the for(;;) loop and falls
// through to the state write-back at the bottom. `ret` is set to 1 only when
// the input ends cleanly; on an invalid character it stays 0.

// Character 0 of the group. Running out of input here is a clean stop:
if (srclen-- == 0) {
ret = 1;
break;
}
// Table values 254 ('=') and 255 (invalid) both end decoding:
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 0:
break;
}
// Keep the 6 bits as the upper bits of the first output byte:
st.carry = q << 2;
st.bytes++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Character 1: completes output byte 0, carries 4 bits over.
case 1: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 1:
break;
}
*o++ = st.carry | (q >> 4);
st.carry = q << 4;
st.bytes++;
outl++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Character 2: completes output byte 1, carries 2 bits over. A '=' here must
// be followed by a second '='.
case 2: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.bytes++;
// When q == 254, the input char is '='.
// Check if next byte is also '=':
if (q == 254) {
if (srclen-- != 0) {
st.bytes = 0;
// EOF:
st.eof = BASE64_EOF;
q = base64_table_dec_8bit[*c++];
// Success only if the next char is '=' and it is the last input byte:
ret = ((q == 254) && (srclen == 0)) ? 1 : 0;
break;
}
else {
// Almost EOF
// The second '=' has not arrived yet; dec_head.c checks for it on the
// next call when it sees BASE64_AEOF.
st.eof = BASE64_AEOF;
ret = 1;
break;
}
}
// If we get here, there was an error:
break;
}
*o++ = st.carry | (q >> 2);
st.carry = q << 6;
st.bytes++;
outl++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Character 3: completes output byte 2 and ends the group.
case 3: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.bytes = 0;
st.eof = BASE64_EOF;
// When q == 254, the input char is '='. Return 1 and EOF.
// When q == 255, the input char is invalid. Return 0 and EOF.
ret = ((q == 254) && (srclen == 0)) ? 1 : 0;
break;
}
*o++ = st.carry | q;
st.carry = 0;
st.bytes = 0;
outl++;
// End of the for(;;) body: the next iteration starts again at `case 0:`.
}
}
// Write the local state back and report the number of bytes produced:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = outl;
return ret;
23 changes: 23 additions & 0 deletions lib/arch/ssse3_atom/enc_head.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Encoder prologue. This fragment is textually included at the top of the
// encoder function body; it declares the locals used by the fragments that
// follow and opens a switch/for construct that is closed elsewhere.

// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
// Byte cursors over the input and output buffers:
const uint8_t *c = (const uint8_t *)src;
uint8_t *o = (uint8_t *)out;

// Use local temporaries to avoid cache thrashing:
size_t outl = 0;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;

// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333

// Duff's device, a for() loop inside a switch() statement. Legal!
// The switch jumps to the case label matching st.bytes, so encoding resumes
// at the position the saved state indicates. Only `case 0:` is opened here;
// the remaining labels are presumably in enc_tail.c (not visible here).
switch (st.bytes)
{
for (;;)
{
case 0:
22 changes: 22 additions & 0 deletions lib/arch/ssse3_atom/enc_loop.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// SSSE3 fast path: encode 12 input bytes into 16 output characters per
// iteration. Each iteration loads a full 16 bytes, so at least 16 bytes of
// input must remain for the read to stay inside the buffer:
while (srclen >= 16)
{
	// Load 16 bytes, of which only the first 12 are encoded:
	const __m128i raw = _mm_loadu_si128((__m128i *)c);

	// Split into 6-bit groups, then map those to the Base64 alphabet:
	const __m128i encoded = enc_translate(enc_reshuffle(raw));

	// Write out 16 characters:
	_mm_storeu_si128((__m128i *)o, encoded);

	// 3 * 4 bytes of input consumed, 4 * 4 bytes of output produced:
	srclen -= 12;
	c += 12;
	outl += 16;
	o += 16;
}
48 changes: 48 additions & 0 deletions lib/arch/ssse3_atom/enc_reshuffle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Spread the first 12 input bytes over sixteen output bytes, each holding one
// 6-bit group in its low bits (top two bits zero). The upper four input bytes
// are ignored.
static inline __m128i
enc_reshuffle (__m128i in)
{
	// Input, bytes MSB to LSB:
	// 0 0 0 0 l k j i h g f e d c b a

	// Give every 32-bit lane its own three source bytes, with the middle
	// byte duplicated. Indices are listed from the lowest byte upwards:
	const __m128i spread = _mm_setr_epi8(
		 1, 0,  2,  1,
		 4, 3,  5,  4,
		 7, 6,  8,  7,
		10, 9, 11, 10);
	const __m128i lanes = _mm_shuffle_epi8(in, spread);
	// Lanes, bytes MSB to LSB:
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	// First and third 6-bit group of every lane. In the diagrams, upper case
	// marks the most significant bits of a source byte, lower case the least
	// significant bits:
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000
	const __m128i odd_fields = _mm_and_si128(lanes, _mm_set1_epi32(0x0fc0fc00));

	// The high half of a 16-bit multiply acts as a per-halfword right shift:
	// by 10 in the low halfword, by 6 in the high halfword.
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA
	const __m128i odd_placed = _mm_mulhi_epu16(odd_fields, _mm_set1_epi32(0x04000040));

	// Second and fourth 6-bit group of every lane:
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000
	const __m128i even_fields = _mm_and_si128(lanes, _mm_set1_epi32(0x003f03f0));

	// The low half of a 16-bit multiply acts as a per-halfword left shift:
	// by 4 in the low halfword, by 8 in the high halfword.
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000
	const __m128i even_placed = _mm_mullo_epi16(even_fields, _mm_set1_epi32(0x01000010));

	// Interleave both halves:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	return _mm_or_si128(odd_placed, even_placed);
}
Loading

0 comments on commit 2765efd

Please sign in to comment.