From 2765efd90d37201f9b054e195e93191fb0a2865e Mon Sep 17 00:00:00 2001 From: Ferry Toth Date: Wed, 22 Jun 2022 21:21:12 +0200 Subject: [PATCH] codec: add ssse3_atom By performing benchmarks on Intel Edison (a Silvermont Atom CPU) in x86_64 mode from v0.3.0 we find that SSSE3 performance had various ups and downs. Substantial changes since v0.3.0 were (throughput in MB/s): HASH SSSE3-decode SSSE3-encode e12e3cd 165 210 3f3f31c 206 150 67ee3fd 205 205 0a69845 145 205 a5b6739 145 218 6310c1f 157 218 9a0d1b2 158 210 5874921 165 210 Best performance was from 67ee3fd until decode performance regressed from 205 to 145 MB/s with commit 0a69845. The commit before that (b6417f3) had best decode performance with relatively good encode. Core(-i7) processors do not show such large performance changes. This patch adds the ssse3 codec from b6417f3 as ssse3_atom. Signed-off-by: Ferry Toth --- CMakeLists.txt | 4 + cmake/Modules/TargetSIMDInstructionSet.cmake | 1 + include/libbase64.h | 1 + lib/arch/sse2/compare_macros.h | 4 + lib/arch/ssse3_atom/codec.c | 42 +++++++++ lib/arch/ssse3_atom/dec_head.c | 36 ++++++++ lib/arch/ssse3_atom/dec_loop.c | 54 ++++++++++++ lib/arch/ssse3_atom/dec_reshuffle.c | 24 ++++++ lib/arch/ssse3_atom/dec_tail.c | 90 ++++++++++++++++++++ lib/arch/ssse3_atom/enc_head.c | 23 +++++ lib/arch/ssse3_atom/enc_loop.c | 22 +++++ lib/arch/ssse3_atom/enc_reshuffle.c | 48 +++++++++++ lib/arch/ssse3_atom/enc_tail.c | 34 ++++++++ lib/arch/ssse3_atom/enc_translate.c | 34 ++++++++ lib/codec_choose.c | 47 +++++++++- lib/lib.c | 2 +- test/benchmark.c | 2 +- test/codec_supported.c | 1 + 18 files changed, 466 insertions(+), 3 deletions(-) create mode 100644 lib/arch/sse2/compare_macros.h create mode 100644 lib/arch/ssse3_atom/codec.c create mode 100644 lib/arch/ssse3_atom/dec_head.c create mode 100644 lib/arch/ssse3_atom/dec_loop.c create mode 100644 lib/arch/ssse3_atom/dec_reshuffle.c create mode 100644 lib/arch/ssse3_atom/dec_tail.c create mode 100644 lib/arch/ssse3_atom/enc_head.c create mode 100644 
lib/arch/ssse3_atom/enc_loop.c create mode 100644 lib/arch/ssse3_atom/enc_reshuffle.c create mode 100644 lib/arch/ssse3_atom/enc_tail.c create mode 100644 lib/arch/ssse3_atom/enc_translate.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 68d5d3eb..9c0feaf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,8 @@ cmake_dependent_option(BASE64_REGENERATE_TABLES "regenerate the codec tables" OF set(_IS_X86 "_TARGET_ARCH STREQUAL \"x86\" OR _TARGET_ARCH STREQUAL \"x64\"") cmake_dependent_option(BASE64_WITH_SSSE3 "add SSSE 3 codepath" ON ${_IS_X86} OFF) add_feature_info(SSSE3 BASE64_WITH_SSSE3 "add SSSE 3 codepath") +cmake_dependent_option(BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath" ON ${_IS_X86} OFF) +add_feature_info(SSSE3_ATOM BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath") cmake_dependent_option(BASE64_WITH_SSE41 "add SSE 4.1 codepath" ON ${_IS_X86} OFF) add_feature_info(SSE4.1 BASE64_WITH_SSE41 "add SSE 4.1 codepath") cmake_dependent_option(BASE64_WITH_SSE42 "add SSE 4.2 codepath" ON ${_IS_X86} OFF) @@ -116,6 +118,7 @@ add_library(base64 lib/arch/generic/codec.c lib/arch/ssse3/codec.c + lib/arch/ssse3_atom/codec.c lib/arch/sse41/codec.c lib/arch/sse42/codec.c lib/arch/avx/codec.c @@ -204,6 +207,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64") endmacro() configure_codec(SSSE3 __SSSE3__) + configure_codec(SSSE3_ATOM __SSSE3__) configure_codec(SSE41 __SSSE4_1__) configure_codec(SSE42 __SSSE4_2__) configure_codec(AVX) diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51..c1154282 100644 --- a/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -17,6 +17,7 @@ macro(define_SIMD_compile_flags) if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") # x86 set(COMPILE_FLAGS_SSSE3 "-mssse3") + set(COMPILE_FLAGS_SSSE3_ATOM "-mssse3") 
set(COMPILE_FLAGS_SSE41 "-msse4.1") set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") diff --git a/include/libbase64.h b/include/libbase64.h index 10a83f85..1e3b58a3 100644 --- a/include/libbase64.h +++ b/include/libbase64.h @@ -53,6 +53,7 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_SSSE3_ATOM (1 << 8) #define BASE64_CHECK_SUPPORT (1 << 15) struct base64_state { diff --git a/lib/arch/sse2/compare_macros.h b/lib/arch/sse2/compare_macros.h new file mode 100644 index 00000000..76d92887 --- /dev/null +++ b/lib/arch/sse2/compare_macros.h @@ -0,0 +1,4 @@ +#define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n)) +#define CMPEQ(s,n) _mm_cmpeq_epi8((s), _mm_set1_epi8(n)) +#define REPLACE(s,n) _mm_and_si128((s), _mm_set1_epi8(n)) +#define RANGE(s,a,b) _mm_andnot_si128(CMPGT((s), (b)), CMPGT((s), (a) - 1)) diff --git a/lib/arch/ssse3_atom/codec.c b/lib/arch/ssse3_atom/codec.c new file mode 100644 index 00000000..25834e95 --- /dev/null +++ b/lib/arch/ssse3_atom/codec.c @@ -0,0 +1,42 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSSE3 +#include <tmmintrin.h> + +#include "../sse2/compare_macros.h" + +#include "dec_reshuffle.c" +#include "enc_reshuffle.c" +#include "enc_translate.c" + +#endif // HAVE_SSSE3 + +BASE64_ENC_FUNCTION(ssse3_atom) +{ +#if HAVE_SSSE3 + #include "enc_head.c" + #include "enc_loop.c" + #include "enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(ssse3_atom) +{ +#if HAVE_SSSE3 + #include "dec_head.c" + #include "dec_loop.c" + #include "dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/lib/arch/ssse3_atom/dec_head.c b/lib/arch/ssse3_atom/dec_head.c new file mode 100644 index 00000000..55c75ef0 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_head.c @@ -0,0 +1,36 @@ +int ret = 0; +const 
uint8_t *c = (const uint8_t *)src; +uint8_t *o = (uint8_t *)out; +uint8_t q; + +// Use local temporaries to avoid cache thrashing: +size_t outl = 0; +struct base64_state st; +st.eof = state->eof; +st.bytes = state->bytes; +st.carry = state->carry; + +// If we previously saw an EOF or an invalid character, bail out: +if (st.eof) { + *outlen = 0; + ret = 0; + // If there was a trailing '=' to check, check it: + if (srclen && (st.eof == BASE64_AEOF)) { + state->bytes = 0; + state->eof = BASE64_EOF; + ret = ((base64_table_dec_8bit[*c++] == 254) && (srclen == 1)) ? 1 : 0; + } + return ret; +} + +// Turn four 6-bit numbers into three bytes: +// out[0] = 11111122 +// out[1] = 22223333 +// out[2] = 33444444 + +// Duff's device again: +switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/lib/arch/ssse3_atom/dec_loop.c b/lib/arch/ssse3_atom/dec_loop.c new file mode 100644 index 00000000..e660d4c6 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_loop.c @@ -0,0 +1,54 @@ +// If we have SSSE3 support, pick off 16 bytes at a time for as long as we can, +// but make sure that we quit before seeing any == markers at the end of the +// string. Also, because we write four zeroes at the end of the output, ensure +// that there are at least 6 valid bytes of input data remaining to close the +// gap. 16 + 2 + 6 = 24 bytes: +while (srclen >= 24) +{ + // Load string: + __m128i str = _mm_loadu_si128((__m128i *)c); + + // The input consists of six character sets in the Base64 alphabet, + // which we need to map back to the 6-bit values they represent. + // There are three ranges, two singles, and then there's the rest. 
+ // + // # From To Add Characters + // 1 [43] [62] +19 + + // 2 [47] [63] +16 / + // 3 [48..57] [52..61] +4 0..9 + // 4 [65..90] [0..25] -65 A..Z + // 5 [97..122] [26..51] -71 a..z + // (6) Everything else => invalid input + + const __m128i set1 = CMPEQ(str, '+'); + const __m128i set2 = CMPEQ(str, '/'); + const __m128i set3 = RANGE(str, '0', '9'); + const __m128i set4 = RANGE(str, 'A', 'Z'); + const __m128i set5 = RANGE(str, 'a', 'z'); + + __m128i delta = REPLACE(set1, 19); + delta = _mm_or_si128(delta, REPLACE(set2, 16)); + delta = _mm_or_si128(delta, REPLACE(set3, 4)); + delta = _mm_or_si128(delta, REPLACE(set4, -65)); + delta = _mm_or_si128(delta, REPLACE(set5, -71)); + + // Check for invalid input: if any of the delta values are zero, + // fall back on bytewise code to do error checking and reporting: + if (_mm_movemask_epi8(CMPEQ(delta, 0))) { + break; + } + + // Now simply add the delta values to the input: + str = _mm_add_epi8(str, delta); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store back: + _mm_storeu_si128((__m128i *)o, str); + + c += 16; + o += 12; + outl += 12; + srclen -= 16; +} diff --git a/lib/arch/ssse3_atom/dec_reshuffle.c b/lib/arch/ssse3_atom/dec_reshuffle.c new file mode 100644 index 00000000..b8cd0c13 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_reshuffle.c @@ -0,0 +1,24 @@ +static inline __m128i +dec_reshuffle (__m128i in) +{ + // Mask in a single byte per shift: + const __m128i maskB2 = _mm_set1_epi32(0x003F0000); + const __m128i maskB1 = _mm_set1_epi32(0x00003F00); + + // Pack bytes together: + __m128i out = _mm_srli_epi32(in, 16); + + out = _mm_or_si128(out, _mm_srli_epi32(_mm_and_si128(in, maskB2), 2)); + + out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, maskB1), 12)); + + out = _mm_or_si128(out, _mm_slli_epi32(in, 26)); + + // Reshuffle and repack into 12-byte output format: + return _mm_shuffle_epi8(out, _mm_setr_epi8( + 3, 2, 1, + 7, 6, 5, + 11, 10, 9, + 15, 14, 13, + -1, 
-1, -1, -1)); +} diff --git a/lib/arch/ssse3_atom/dec_tail.c b/lib/arch/ssse3_atom/dec_tail.c new file mode 100644 index 00000000..e5831b21 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_tail.c @@ -0,0 +1,90 @@ + if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 0: + break; + } + st.carry = q << 2; + st.bytes++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 1: + break; + } + *o++ = st.carry | (q >> 4); + st.carry = q << 4; + st.bytes++; + outl++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.bytes++; + // When q == 254, the input char is '='. + // Check if next byte is also '=': + if (q == 254) { + if (srclen-- != 0) { + st.bytes = 0; + // EOF: + st.eof = BASE64_EOF; + q = base64_table_dec_8bit[*c++]; + ret = ((q == 254) && (srclen == 0)) ? 1 : 0; + break; + } + else { + // Almost EOF + st.eof = BASE64_AEOF; + ret = 1; + break; + } + } + // If we get here, there was an error: + break; + } + *o++ = st.carry | (q >> 2); + st.carry = q << 6; + st.bytes++; + outl++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 3: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.bytes = 0; + st.eof = BASE64_EOF; + // When q == 254, the input char is '='. Return 1 and EOF. + // When q == 255, the input char is invalid. Return 0 and EOF. + ret = ((q == 254) && (srclen == 0)) ? 
1 : 0; + break; + } + *o++ = st.carry | q; + st.carry = 0; + st.bytes = 0; + outl++; + } +} +state->eof = st.eof; +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = outl; +return ret; diff --git a/lib/arch/ssse3_atom/enc_head.c b/lib/arch/ssse3_atom/enc_head.c new file mode 100644 index 00000000..594054cf --- /dev/null +++ b/lib/arch/ssse3_atom/enc_head.c @@ -0,0 +1,23 @@ +// Assume that *out is large enough to contain the output. +// Theoretically it should be 4/3 the length of src. +const uint8_t *c = (const uint8_t *)src; +uint8_t *o = (uint8_t *)out; + +// Use local temporaries to avoid cache thrashing: +size_t outl = 0; +struct base64_state st; +st.bytes = state->bytes; +st.carry = state->carry; + +// Turn three bytes into four 6-bit numbers: +// in[0] = 00111111 +// in[1] = 00112222 +// in[2] = 00222233 +// in[3] = 00333333 + +// Duff's device, a for() loop inside a switch() statement. Legal! +switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/lib/arch/ssse3_atom/enc_loop.c b/lib/arch/ssse3_atom/enc_loop.c new file mode 100644 index 00000000..0518595e --- /dev/null +++ b/lib/arch/ssse3_atom/enc_loop.c @@ -0,0 +1,22 @@ +// If we have SSSE3 support, pick off 12 bytes at a time for as long as we can. 
+// But because we read 16 bytes at a time, ensure we have enough room to do a +// full 16-byte read without segfaulting: +while (srclen >= 16) +{ + // Load string: + __m128i str = _mm_loadu_si128((__m128i *)c); + + // Reshuffle: + str = enc_reshuffle(str); + + // Translate reshuffled bytes to the Base64 alphabet: + str = enc_translate(str); + + // Store: + _mm_storeu_si128((__m128i *)o, str); + + c += 12; // 3 * 4 bytes of input + o += 16; // 4 * 4 bytes of output + outl += 16; + srclen -= 12; +} diff --git a/lib/arch/ssse3_atom/enc_reshuffle.c b/lib/arch/ssse3_atom/enc_reshuffle.c new file mode 100644 index 00000000..088b3999 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_reshuffle.c @@ -0,0 +1,48 @@ +static inline __m128i +enc_reshuffle (__m128i in) +{ + // input, bytes MSB to LSB: + // 0 0 0 0 l k j i h g f e d c b a + + in = _mm_shuffle_epi8(in, _mm_set_epi8( + 10, 11, 9, 10, + 7, 8, 6, 7, + 4, 5, 3, 4, + 1, 2, 0, 1)); + // in, bytes MSB to LSB: + // k l j k + // h i g h + // e f d e + // b c a b + + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + // bits, upper case are most significant bits, lower case are least significant bits + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 
+ // 00cccccc 00000000 00aaBBBB 00000000 + + return _mm_or_si128(t1, t3); + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA +} diff --git a/lib/arch/ssse3_atom/enc_tail.c b/lib/arch/ssse3_atom/enc_tail.c new file mode 100644 index 00000000..45312546 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_tail.c @@ -0,0 +1,34 @@ + if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[*c >> 2]; + st.carry = (*c++ << 4) & 0x30; + st.bytes++; + outl += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*c >> 4)]; + st.carry = (*c++ << 2) & 0x3C; + st.bytes++; + outl += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*c >> 6)]; + *o++ = base64_table_enc_6bit[*c++ & 0x3F]; + st.bytes = 0; + outl += 2; + } +} +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = outl; diff --git a/lib/arch/ssse3_atom/enc_translate.c b/lib/arch/ssse3_atom/enc_translate.c new file mode 100644 index 00000000..9fbbbf75 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_translate.c @@ -0,0 +1,34 @@ +static inline __m128i +enc_translate (const __m128i in) +{ + // LUT contains Absolute offset for all ranges: + const __m128i lut = _mm_setr_epi8( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + // Translate values 0..63 to the Base64 alphabet. 
There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from input: + // the index for range #0 is right, others are 1 less than expected: + __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51)); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + __m128i mask = CMPGT(in, 25); + + // subtract the mask (-1), i.e. add 1 to the indices for range #[1..4]; all indices are now correct: + indices = _mm_sub_epi8(indices, mask); + + // Add offsets to input values: + __m128i out = _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); + + return out; +} diff --git a/lib/codec_choose.c b/lib/codec_choose.c index d97039e7..dd0b7c1d 100644 --- a/lib/codec_choose.c +++ b/lib/codec_choose.c @@ -83,12 +83,14 @@ BASE64_CODEC_FUNCS(ssse3) BASE64_CODEC_FUNCS(sse41) BASE64_CODEC_FUNCS(sse42) BASE64_CODEC_FUNCS(avx) +BASE64_CODEC_FUNCS(ssse3_atom) static bool avx2_supported(void); static bool avx_supported(void); static bool sse42_supported(void); static bool sse41_supported(void); static bool ssse3_supported(void); +static bool ssse3_atom_supported(void); static bool codec_choose_forced (struct codec *codec, int flags) @@ -101,7 +103,7 @@ codec_choose_forced (struct codec *codec, int flags) check = flags & BASE64_CHECK_SUPPORT; flags = flags & ~BASE64_CHECK_SUPPORT; - if (!(flags & 0xFF)) { + if (!(flags & 0xFFF)) { return false; } if (flags & BASE64_FORCE_AVX2) { @@ -161,6 +163,15 @@ codec_choose_forced (struct codec *codec, int flags) codec->dec = NULL; } } + if (flags & BASE64_FORCE_SSSE3_ATOM) { + if (!check || ssse3_atom_supported()) { + codec->enc = base64_stream_encode_ssse3_atom; + codec->dec = base64_stream_decode_ssse3_atom; + } else { + codec->enc = NULL; + codec->dec = NULL; + } + } return true; } @@ -362,6 +373,35 @@ static bool 
ssse3_supported(void) static bool ssse3_supported(void) {return false;} #endif +#if HAVE_SSSE3 +static bool ssse3_atom_supported(void) +{ + unsigned int eax, ebx = 0, ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); +#endif + + // Check for SSSE3 support: + if (max_level >= 1) { + __cpuid(1, eax, ebx, ecx, edx); + if (ecx & bit_SSSE3) { + return true; + } + } + + return false; +} +#else +static bool ssse3_atom_supported(void) {return false;} +#endif + + static bool codec_choose_x86 (struct codec *codec) { @@ -390,6 +430,11 @@ codec_choose_x86 (struct codec *codec) codec->dec = base64_stream_decode_ssse3; return true; } + if(ssse3_atom_supported()) { + codec->enc = base64_stream_encode_ssse3_atom; + codec->dec = base64_stream_decode_ssse3_atom; + return true; + } (void)codec; return false; diff --git a/lib/lib.c b/lib/lib.c index 66637c68..c8b6721c 100644 --- a/lib/lib.c +++ b/lib/lib.c @@ -19,7 +19,7 @@ void base64_stream_encode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.enc == NULL || flags & 0xFF) { + if (codec.enc == NULL || flags & 0xFFF) { codec_choose(&codec, flags); } state->eof = 0; diff --git a/test/benchmark.c b/test/benchmark.c index 80d21a38..0734ee98 100644 --- a/test/benchmark.c +++ b/test/benchmark.c @@ -213,7 +213,7 @@ main () } // Loop over all buffer sizes: - for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { + for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]) ; i++) { printf("Testing with buffer size %s, fastest of %d * %d\n", sizes[i].label, sizes[i].repeat, sizes[i].batch); diff --git a/test/codec_supported.c b/test/codec_supported.c index 716c1f61..d5579c61 100644 --- a/test/codec_supported.c +++ b/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "SSSE3_ATOM" , NULL } ;