Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarks #95

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ cmake_dependent_option(BASE64_REGENERATE_TABLES "regenerate the codec tables" OF
set(_IS_X86 "_TARGET_ARCH STREQUAL \"x86\" OR _TARGET_ARCH STREQUAL \"x64\"")
cmake_dependent_option(BASE64_WITH_SSSE3 "add SSSE 3 codepath" ON ${_IS_X86} OFF)
add_feature_info(SSSE3 BASE64_WITH_SSSE3 "add SSSE 3 codepath")
# SSSE3 codepath variant for Atom. It is registered under its own feature
# name so the feature summary does not list "SSSE3" twice with two
# different enable variables.
cmake_dependent_option(BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath" ON ${_IS_X86} OFF)
add_feature_info(SSSE3_ATOM BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath")
cmake_dependent_option(BASE64_WITH_SSE41 "add SSE 4.1 codepath" ON ${_IS_X86} OFF)
add_feature_info(SSE4.1 BASE64_WITH_SSE41 "add SSE 4.1 codepath")
cmake_dependent_option(BASE64_WITH_SSE42 "add SSE 4.2 codepath" ON ${_IS_X86} OFF)
Expand Down Expand Up @@ -116,6 +118,7 @@ add_library(base64
lib/arch/generic/codec.c

lib/arch/ssse3/codec.c
lib/arch/ssse3_atom/codec.c
lib/arch/sse41/codec.c
lib/arch/sse42/codec.c
lib/arch/avx/codec.c
Expand Down Expand Up @@ -204,6 +207,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
endmacro()

configure_codec(SSSE3 __SSSE3__)
configure_codec(SSSE3_ATOM __SSSE3__)
configure_codec(SSE41 __SSSE4_1__)
configure_codec(SSE42 __SSSE4_2__)
configure_codec(AVX)
Expand Down
1 change: 1 addition & 0 deletions cmake/Modules/TargetSIMDInstructionSet.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ macro(define_SIMD_compile_flags)
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
# x86
set(COMPILE_FLAGS_SSSE3 "-mssse3")
set(COMPILE_FLAGS_SSSE3_ATOM "-mssse3")
set(COMPILE_FLAGS_SSE41 "-msse4.1")
set(COMPILE_FLAGS_SSE42 "-msse4.2")
set(COMPILE_FLAGS_AVX "-mavx")
Expand Down
2 changes: 2 additions & 0 deletions include/libbase64.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ extern "C" {
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_SSSE3_ATOM (1 << 8)
#define BASE64_CHECK_SUPPORT (1 << 15)

struct base64_state {
int eof;
Expand Down
4 changes: 4 additions & 0 deletions lib/arch/sse2/compare_macros.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Byte-wise comparison helpers for a 128-bit vector `s` of sixteen 8-bit lanes.
// Each comparison yields a per-lane mask: 0xFF where the test holds, 0x00
// elsewhere.

// CMPGT: mask of lanes where s > n. _mm_cmpgt_epi8 compares lanes as signed
// 8-bit integers.
#define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n))
// CMPEQ: mask of lanes where s == n.
#define CMPEQ(s,n) _mm_cmpeq_epi8((s), _mm_set1_epi8(n))
// REPLACE: bitwise AND of s with n broadcast to every lane. When s is a
// comparison mask this gives n in the selected lanes and 0 in all others.
#define REPLACE(s,n) _mm_and_si128((s), _mm_set1_epi8(n))
// RANGE: mask of lanes where a <= s <= b, computed as
// (s > a - 1) AND NOT (s > b).
#define RANGE(s,a,b) _mm_andnot_si128(CMPGT((s), (b)), CMPGT((s), (a) - 1))
42 changes: 42 additions & 0 deletions lib/arch/ssse3_atom/codec.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

// Only pull in the SSSE3 intrinsics header and the helper fragments when
// HAVE_SSSE3 is nonzero (presumably provided by config.h -- confirm):
#if HAVE_SSSE3
#include <tmmintrin.h>

#include "../sse2/compare_macros.h"

#include "dec_reshuffle.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"

#endif // HAVE_SSSE3

// Encoder for the ssse3_atom codec. The function signature is produced by the
// BASE64_ENC_FUNCTION macro (defined outside this file). With HAVE_SSSE3 set,
// the body is stitched together from three source fragments: enc_head.c opens
// the byte-wise state machine, enc_loop.c is the vectorized loop, and
// enc_tail.c supplies the rest of the body.
BASE64_ENC_FUNCTION(ssse3_atom)
{
#if HAVE_SSSE3
#include "enc_head.c"
#include "enc_loop.c"
#include "enc_tail.c"
#else
// SSSE3 not compiled in: the body is whatever BASE64_ENC_STUB expands to.
BASE64_ENC_STUB
#endif
}

// Decoder for the ssse3_atom codec. The function signature is produced by the
// BASE64_DEC_FUNCTION macro (defined outside this file). With HAVE_SSSE3 set,
// the body is stitched together from three source fragments: dec_head.c sets
// up state and opens the byte-wise state machine, dec_loop.c is the
// vectorized loop, and dec_tail.c finishes the state machine and returns.
BASE64_DEC_FUNCTION(ssse3_atom)
{
#if HAVE_SSSE3
#include "dec_head.c"
#include "dec_loop.c"
#include "dec_tail.c"
#else
// SSSE3 not compiled in: the body is whatever BASE64_DEC_STUB expands to.
BASE64_DEC_STUB
#endif
}
36 changes: 36 additions & 0 deletions lib/arch/ssse3_atom/dec_head.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Decoder prologue. Sets up the input/output cursors, snapshots the stream
// state into a local struct, handles a stream that has already ended, and
// then opens the switch/for construct that the following fragments continue.
// This fragment is not a complete statement on its own.
int ret = 0;
const uint8_t *c = (const uint8_t *)src;
uint8_t *o = (uint8_t *)out;
// Decoded value of the current input character (a table lookup result):
uint8_t q;

// Use local temporaries to avoid cache thrashing:
size_t outl = 0;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;

// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
*outlen = 0;
ret = 0;
// If there was a trailing '=' to check, check it:
// (BASE64_AEOF means one '=' has been seen and a second one is expected.
// Success requires that the single remaining input byte decodes to 254,
// the table value for '='.)
if (srclen && (st.eof == BASE64_AEOF)) {
state->bytes = 0;
state->eof = BASE64_EOF;
ret = ((base64_table_dec_8bit[*c++] == 254) && (srclen == 1)) ? 1 : 0;
}
return ret;
}

// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444

// Duff's device again:
// st.bytes is the position within the current 4-character group, so the
// switch jumps straight into the matching case inside the loop body.
switch (st.bytes)
{
for (;;)
{
case 0:
54 changes: 54 additions & 0 deletions lib/arch/ssse3_atom/dec_loop.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Vectorized fast path: decode 16 input characters into 12 output bytes per
// pass. The loop must stop before any trailing '=' markers, and because each
// store writes 16 bytes (12 decoded bytes followed by four zeroes), at least
// 6 more valid input bytes must remain so later output overwrites the zeroes.
// Hence the 16 + 2 + 6 = 24 byte threshold:
while (srclen >= 24)
{
	// Fetch the next 16 characters (unaligned load):
	__m128i chunk = _mm_loadu_si128((__m128i *)c);

	// Each character belongs to one of six classes; the first five map
	// back to a 6-bit value by adding a class-specific offset:
	//
	//  #  From       To         Add  Characters
	//  1  [43]       [62]       +19  +
	//  2  [47]       [63]       +16  /
	//  3  [48..57]   [52..61]    +4  0..9
	//  4  [65..90]   [0..25]    -65  A..Z
	//  5  [97..122]  [26..51]   -71  a..z
	// (6) Everything else => invalid input

	// Per-class offsets, zero outside the class. The classes are disjoint,
	// so the masked offsets can simply be OR-ed together:
	const __m128i off_plus  = REPLACE(CMPEQ(chunk, '+'), 19);
	const __m128i off_slash = REPLACE(CMPEQ(chunk, '/'), 16);
	const __m128i off_digit = REPLACE(RANGE(chunk, '0', '9'), 4);
	const __m128i off_upper = REPLACE(RANGE(chunk, 'A', 'Z'), -65);
	const __m128i off_lower = REPLACE(RANGE(chunk, 'a', 'z'), -71);

	const __m128i offsets = _mm_or_si128(
		_mm_or_si128(off_plus, off_slash),
		_mm_or_si128(off_digit, _mm_or_si128(off_upper, off_lower)));

	// A zero offset means the character matched no class. Leave the fast
	// path and let the bytewise code do error checking and reporting:
	if (_mm_movemask_epi8(CMPEQ(offsets, 0)) != 0) {
		break;
	}

	// Convert characters to 6-bit values, then pack them into the
	// 12-byte output layout:
	chunk = dec_reshuffle(_mm_add_epi8(chunk, offsets));

	// Write the result (16 bytes stored, 12 of them meaningful):
	_mm_storeu_si128((__m128i *)o, chunk);

	srclen -= 16;
	c += 16;
	o += 12;
	outl += 12;
}
24 changes: 24 additions & 0 deletions lib/arch/ssse3_atom/dec_reshuffle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Pack sixteen 6-bit values (one per input byte) into 12 contiguous output
// bytes. Expects every input byte to already hold a 6-bit value. The last
// four bytes of the returned vector are zero.
static inline __m128i
dec_reshuffle (__m128i in)
{
	// Selectors for the second and third byte of every 32-bit lane:
	const __m128i sel_byte1 = _mm_set1_epi32(0x00003F00);
	const __m128i sel_byte2 = _mm_set1_epi32(0x003F0000);

	// Move each 6-bit field to its place in the upper 24 bits of the lane.
	// Bytes are named from the least significant end of the lane:
	//   byte 0 -> bits 26..31
	//   byte 1 -> bits 20..25
	//   byte 2 -> bits 14..19
	//   byte 3 -> bits  8..13
	// The plain right shift also drops a copy of byte 2 into bits 0..7; the
	// final shuffle never selects that byte, so it is harmless.
	const __m128i field0 = _mm_slli_epi32(in, 26);
	const __m128i field1 = _mm_slli_epi32(_mm_and_si128(in, sel_byte1), 12);
	const __m128i field2 = _mm_srli_epi32(_mm_and_si128(in, sel_byte2), 2);
	const __m128i field3 = _mm_srli_epi32(in, 16);

	const __m128i packed = _mm_or_si128(
		_mm_or_si128(field0, field1),
		_mm_or_si128(field2, field3));

	// Take the upper three bytes of each lane, most significant first, and
	// lay them out back to back; index -1 zeroes the four trailing bytes:
	const __m128i byte_order = _mm_setr_epi8(
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1);

	return _mm_shuffle_epi8(packed, byte_order);
}
90 changes: 90 additions & 0 deletions lib/arch/ssse3_atom/dec_tail.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Byte-wise decoder state machine. This fragment continues inside `case 0:` of
// a switch/for construct opened by an earlier fragment, and it ends with the
// state write-back and the function's return. Each case consumes one input
// character. Table values >= 254 mark characters that are not part of the
// alphabet: 254 is '=', 255 is any other invalid character. A `break` leaves
// the for loop and therefore proceeds to the write-back at the bottom.

// Position 0: running out of input here is a clean stop.
if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 0:
break;
}
// Keep the six bits in the top of the carry until the next character arrives:
st.carry = q << 2;
st.bytes++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Position 1: completes the first output byte.
case 1: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 1:
break;
}
*o++ = st.carry | (q >> 4);
st.carry = q << 4;
st.bytes++;
outl++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Position 2: completes the second output byte, or starts the "==" padding.
case 2: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.bytes++;
// When q == 254, the input char is '='.
// Check if next byte is also '=':
if (q == 254) {
if (srclen-- != 0) {
st.bytes = 0;
// EOF:
st.eof = BASE64_EOF;
q = base64_table_dec_8bit[*c++];
// Success only if the second '=' is also the very last input byte:
ret = ((q == 254) && (srclen == 0)) ? 1 : 0;
break;
}
else {
// Almost EOF
// (the second '=' has to be checked on the next call)
st.eof = BASE64_AEOF;
ret = 1;
break;
}
}
// If we get here, there was an error:
break;
}
*o++ = st.carry | (q >> 2);
st.carry = q << 6;
st.bytes++;
outl++;

// Deliberate fallthrough:
BASE64_FALLTHROUGH

// Position 3: completes the third output byte, or consumes a single '='.
case 3: if (srclen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*c++]) >= 254) {
st.bytes = 0;
st.eof = BASE64_EOF;
// When q == 254, the input char is '='. Return 1 and EOF.
// When q == 255, the input char is invalid. Return 0 and EOF.
ret = ((q == 254) && (srclen == 0)) ? 1 : 0;
break;
}
*o++ = st.carry | q;
st.carry = 0;
st.bytes = 0;
outl++;
}
}
// Write the local state copy back and report how many bytes were produced:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = outl;
return ret;
23 changes: 23 additions & 0 deletions lib/arch/ssse3_atom/enc_head.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Encoder prologue. Sets up the input/output cursors, snapshots the stream
// state into a local struct, and opens the switch/for construct that the
// following fragments continue. This fragment is not a complete statement on
// its own.

// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
const uint8_t *c = (const uint8_t *)src;
uint8_t *o = (uint8_t *)out;

// Use local temporaries to avoid cache thrashing:
// (only `bytes` and `carry` are copied here; `st.eof` is left unset)
size_t outl = 0;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;

// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333

// Duff's device, a for() loop inside a switch() statement. Legal!
// The switch on st.bytes jumps directly to the matching case label inside
// the loop body.
switch (st.bytes)
{
for (;;)
{
case 0:
22 changes: 22 additions & 0 deletions lib/arch/ssse3_atom/enc_loop.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Vectorized fast path: every pass consumes 12 input bytes and produces 16
// output characters. The load always fetches a full 16 bytes, so at least 16
// bytes of input must remain to keep the read inside the source buffer:
while (srclen >= 16)
{
	// Unaligned 16-byte load; only the first 12 bytes are encoded:
	const __m128i raw = _mm_loadu_si128((__m128i *)c);

	// Spread the 12 bytes into sixteen 6-bit values, then translate those
	// values to characters of the Base64 alphabet:
	const __m128i encoded = enc_translate(enc_reshuffle(raw));

	// Write out all 16 characters:
	_mm_storeu_si128((__m128i *)o, encoded);

	srclen -= 12;
	c += 12;	// 3 * 4 bytes of input
	o += 16;	// 4 * 4 bytes of output
	outl += 16;
}
48 changes: 48 additions & 0 deletions lib/arch/ssse3_atom/enc_reshuffle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Spread the first 12 bytes of `in` into sixteen bytes that each hold one
// 6-bit value in their low bits (top two bits zero), in output order. The
// upper four input bytes are ignored.
static inline __m128i
enc_reshuffle (__m128i in)
{
	// Input, bytes MSB to LSB:
	// 0 0 0 0 l k j i h g f e d c b a
	//
	// Give every 32-bit lane the three source bytes it needs, with the
	// middle byte duplicated. Lanes afterwards, bytes MSB to LSB:
	// k l j k
	// h i g h
	// e f d e
	// b c a b
	const __m128i byte_order = _mm_set_epi8(
		10, 11, 9, 10,
		 7,  8, 6,  7,
		 4,  5, 3,  4,
		 1,  2, 0,  1);
	const __m128i grouped = _mm_shuffle_epi8(in, byte_order);

	// Isolate the two fields that have to move right (upper case letters
	// are the most significant bits of a byte, lower case the least):
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000
	const __m128i right_fields = _mm_and_si128(grouped, _mm_set1_epi32(0x0fc0fc00));

	// Isolate the two fields that have to move left:
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000
	const __m128i left_fields = _mm_and_si128(grouped, _mm_set1_epi32(0x003f03f0));

	// A high-half multiply acts as a per-16-bit right shift (by 10 in the
	// low half of each lane, by 6 in the high half):
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA
	const __m128i even_bytes = _mm_mulhi_epu16(right_fields, _mm_set1_epi32(0x04000040));

	// A low-half multiply acts as a per-16-bit left shift (by 4 in the low
	// half of each lane, by 8 in the high half):
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000
	const __m128i odd_bytes = _mm_mullo_epi16(left_fields, _mm_set1_epi32(0x01000010));

	// Interleave both halves:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	return _mm_or_si128(even_bytes, odd_bytes);
}
Loading