diff --git a/CMakeLists.txt b/CMakeLists.txt index 68d5d3eb..9c0feaf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,8 @@ cmake_dependent_option(BASE64_REGENERATE_TABLES "regenerate the codec tables" OF set(_IS_X86 "_TARGET_ARCH STREQUAL \"x86\" OR _TARGET_ARCH STREQUAL \"x64\"") cmake_dependent_option(BASE64_WITH_SSSE3 "add SSSE 3 codepath" ON ${_IS_X86} OFF) add_feature_info(SSSE3 BASE64_WITH_SSSE3 "add SSSE 3 codepath") +cmake_dependent_option(BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath" ON ${_IS_X86} OFF) +add_feature_info(SSSE3_ATOM BASE64_WITH_SSSE3_ATOM "add SSSE 3 for ATOM codepath") cmake_dependent_option(BASE64_WITH_SSE41 "add SSE 4.1 codepath" ON ${_IS_X86} OFF) add_feature_info(SSE4.1 BASE64_WITH_SSE41 "add SSE 4.1 codepath") cmake_dependent_option(BASE64_WITH_SSE42 "add SSE 4.2 codepath" ON ${_IS_X86} OFF) @@ -116,6 +118,7 @@ add_library(base64 lib/arch/generic/codec.c lib/arch/ssse3/codec.c + lib/arch/ssse3_atom/codec.c lib/arch/sse41/codec.c lib/arch/sse42/codec.c lib/arch/avx/codec.c @@ -204,6 +207,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64") endmacro() configure_codec(SSSE3 __SSSE3__) + configure_codec(SSSE3_ATOM __SSSE3__) configure_codec(SSE41 __SSSE4_1__) configure_codec(SSE42 __SSSE4_2__) configure_codec(AVX) diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51..c1154282 100644 --- a/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -17,6 +17,7 @@ macro(define_SIMD_compile_flags) if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") # x86 set(COMPILE_FLAGS_SSSE3 "-mssse3") + set(COMPILE_FLAGS_SSSE3_ATOM "-mssse3") set(COMPILE_FLAGS_SSE41 "-msse4.1") set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") diff --git a/include/libbase64.h b/include/libbase64.h index d470a82f..1e3b58a3 100644 --- 
a/include/libbase64.h +++ b/include/libbase64.h @@ -53,6 +53,8 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_SSSE3_ATOM (1 << 8) +#define BASE64_CHECK_SUPPORT (1 << 15) struct base64_state { int eof; diff --git a/lib/arch/sse2/compare_macros.h b/lib/arch/sse2/compare_macros.h new file mode 100644 index 00000000..76d92887 --- /dev/null +++ b/lib/arch/sse2/compare_macros.h @@ -0,0 +1,4 @@ +#define CMPGT(s,n) _mm_cmpgt_epi8((s), _mm_set1_epi8(n)) +#define CMPEQ(s,n) _mm_cmpeq_epi8((s), _mm_set1_epi8(n)) +#define REPLACE(s,n) _mm_and_si128((s), _mm_set1_epi8(n)) +#define RANGE(s,a,b) _mm_andnot_si128(CMPGT((s), (b)), CMPGT((s), (a) - 1)) diff --git a/lib/arch/ssse3_atom/codec.c b/lib/arch/ssse3_atom/codec.c new file mode 100644 index 00000000..25834e95 --- /dev/null +++ b/lib/arch/ssse3_atom/codec.c @@ -0,0 +1,42 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSSE3 +#include <tmmintrin.h> + +#include "../sse2/compare_macros.h" + +#include "dec_reshuffle.c" +#include "enc_reshuffle.c" +#include "enc_translate.c" + +#endif // HAVE_SSSE3 + +BASE64_ENC_FUNCTION(ssse3_atom) +{ +#if HAVE_SSSE3 + #include "enc_head.c" + #include "enc_loop.c" + #include "enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(ssse3_atom) +{ +#if HAVE_SSSE3 + #include "dec_head.c" + #include "dec_loop.c" + #include "dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/lib/arch/ssse3_atom/dec_head.c b/lib/arch/ssse3_atom/dec_head.c new file mode 100644 index 00000000..55c75ef0 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_head.c @@ -0,0 +1,36 @@ +int ret = 0; +const uint8_t *c = (const uint8_t *)src; +uint8_t *o = (uint8_t *)out; +uint8_t q; + +// Use local temporaries to avoid cache thrashing: +size_t outl = 0; +struct base64_state st; +st.eof = 
state->eof; +st.bytes = state->bytes; +st.carry = state->carry; + +// If we previously saw an EOF or an invalid character, bail out: +if (st.eof) { + *outlen = 0; + ret = 0; + // If there was a trailing '=' to check, check it: + if (srclen && (st.eof == BASE64_AEOF)) { + state->bytes = 0; + state->eof = BASE64_EOF; + ret = ((base64_table_dec_8bit[*c++] == 254) && (srclen == 1)) ? 1 : 0; + } + return ret; +} + +// Turn four 6-bit numbers into three bytes: +// out[0] = 11111122 +// out[1] = 22223333 +// out[2] = 33444444 + +// Duff's device again: +switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/lib/arch/ssse3_atom/dec_loop.c b/lib/arch/ssse3_atom/dec_loop.c new file mode 100644 index 00000000..e660d4c6 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_loop.c @@ -0,0 +1,54 @@ +// If we have SSSE3 support, pick off 16 bytes at a time for as long as we can, +// but make sure that we quit before seeing any == markers at the end of the +// string. Also, because we write four zeroes at the end of the output, ensure +// that there are at least 6 valid bytes of input data remaining to close the +// gap. 16 + 2 + 6 = 24 bytes: +while (srclen >= 24) +{ + // Load string: + __m128i str = _mm_loadu_si128((__m128i *)c); + + // The input consists of six character sets in the Base64 alphabet, + // which we need to map back to the 6-bit values they represent. + // There are three ranges, two singles, and then there's the rest. 
+ // + // # From To Add Characters + // 1 [43] [62] +19 + + // 2 [47] [63] +16 / + // 3 [48..57] [52..61] +4 0..9 + // 4 [65..90] [0..25] -65 A..Z + // 5 [97..122] [26..51] -71 a..z + // (6) Everything else => invalid input + + const __m128i set1 = CMPEQ(str, '+'); + const __m128i set2 = CMPEQ(str, '/'); + const __m128i set3 = RANGE(str, '0', '9'); + const __m128i set4 = RANGE(str, 'A', 'Z'); + const __m128i set5 = RANGE(str, 'a', 'z'); + + __m128i delta = REPLACE(set1, 19); + delta = _mm_or_si128(delta, REPLACE(set2, 16)); + delta = _mm_or_si128(delta, REPLACE(set3, 4)); + delta = _mm_or_si128(delta, REPLACE(set4, -65)); + delta = _mm_or_si128(delta, REPLACE(set5, -71)); + + // Check for invalid input: if any of the delta values are zero, + // fall back on bytewise code to do error checking and reporting: + if (_mm_movemask_epi8(CMPEQ(delta, 0))) { + break; + } + + // Now simply add the delta values to the input: + str = _mm_add_epi8(str, delta); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store back: + _mm_storeu_si128((__m128i *)o, str); + + c += 16; + o += 12; + outl += 12; + srclen -= 16; +} diff --git a/lib/arch/ssse3_atom/dec_reshuffle.c b/lib/arch/ssse3_atom/dec_reshuffle.c new file mode 100644 index 00000000..b8cd0c13 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_reshuffle.c @@ -0,0 +1,24 @@ +static inline __m128i +dec_reshuffle (__m128i in) +{ + // Mask in a single byte per shift: + const __m128i maskB2 = _mm_set1_epi32(0x003F0000); + const __m128i maskB1 = _mm_set1_epi32(0x00003F00); + + // Pack bytes together: + __m128i out = _mm_srli_epi32(in, 16); + + out = _mm_or_si128(out, _mm_srli_epi32(_mm_and_si128(in, maskB2), 2)); + + out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, maskB1), 12)); + + out = _mm_or_si128(out, _mm_slli_epi32(in, 26)); + + // Reshuffle and repack into 12-byte output format: + return _mm_shuffle_epi8(out, _mm_setr_epi8( + 3, 2, 1, + 7, 6, 5, + 11, 10, 9, + 15, 14, 13, + -1, 
-1, -1, -1)); +} diff --git a/lib/arch/ssse3_atom/dec_tail.c b/lib/arch/ssse3_atom/dec_tail.c new file mode 100644 index 00000000..e5831b21 --- /dev/null +++ b/lib/arch/ssse3_atom/dec_tail.c @@ -0,0 +1,90 @@ + if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 0: + break; + } + st.carry = q << 2; + st.bytes++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 1: + break; + } + *o++ = st.carry | (q >> 4); + st.carry = q << 4; + st.bytes++; + outl++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.bytes++; + // When q == 254, the input char is '='. + // Check if next byte is also '=': + if (q == 254) { + if (srclen-- != 0) { + st.bytes = 0; + // EOF: + st.eof = BASE64_EOF; + q = base64_table_dec_8bit[*c++]; + ret = ((q == 254) && (srclen == 0)) ? 1 : 0; + break; + } + else { + // Almost EOF + st.eof = BASE64_AEOF; + ret = 1; + break; + } + } + // If we get here, there was an error: + break; + } + *o++ = st.carry | (q >> 2); + st.carry = q << 6; + st.bytes++; + outl++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 3: if (srclen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*c++]) >= 254) { + st.bytes = 0; + st.eof = BASE64_EOF; + // When q == 254, the input char is '='. Return 1 and EOF. + // When q == 255, the input char is invalid. Return 0 and EOF. + ret = ((q == 254) && (srclen == 0)) ? 
1 : 0; + break; + } + *o++ = st.carry | q; + st.carry = 0; + st.bytes = 0; + outl++; + } +} +state->eof = st.eof; +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = outl; +return ret; diff --git a/lib/arch/ssse3_atom/enc_head.c b/lib/arch/ssse3_atom/enc_head.c new file mode 100644 index 00000000..594054cf --- /dev/null +++ b/lib/arch/ssse3_atom/enc_head.c @@ -0,0 +1,23 @@ +// Assume that *out is large enough to contain the output. +// Theoretically it should be 4/3 the length of src. +const uint8_t *c = (const uint8_t *)src; +uint8_t *o = (uint8_t *)out; + +// Use local temporaries to avoid cache thrashing: +size_t outl = 0; +struct base64_state st; +st.bytes = state->bytes; +st.carry = state->carry; + +// Turn three bytes into four 6-bit numbers: +// in[0] = 00111111 +// in[1] = 00112222 +// in[2] = 00222233 +// in[3] = 00333333 + +// Duff's device, a for() loop inside a switch() statement. Legal! +switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/lib/arch/ssse3_atom/enc_loop.c b/lib/arch/ssse3_atom/enc_loop.c new file mode 100644 index 00000000..0518595e --- /dev/null +++ b/lib/arch/ssse3_atom/enc_loop.c @@ -0,0 +1,22 @@ +// If we have SSSE3 support, pick off 12 bytes at a time for as long as we can. 
+// But because we read 16 bytes at a time, ensure we have enough room to do a +// full 16-byte read without segfaulting: +while (srclen >= 16) +{ + // Load string: + __m128i str = _mm_loadu_si128((__m128i *)c); + + // Reshuffle: + str = enc_reshuffle(str); + + // Translate reshuffled bytes to the Base64 alphabet: + str = enc_translate(str); + + // Store: + _mm_storeu_si128((__m128i *)o, str); + + c += 12; // 3 * 4 bytes of input + o += 16; // 4 * 4 bytes of output + outl += 16; + srclen -= 12; +} diff --git a/lib/arch/ssse3_atom/enc_reshuffle.c b/lib/arch/ssse3_atom/enc_reshuffle.c new file mode 100644 index 00000000..088b3999 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_reshuffle.c @@ -0,0 +1,48 @@ +static inline __m128i +enc_reshuffle (__m128i in) +{ + // input, bytes MSB to LSB: + // 0 0 0 0 l k j i h g f e d c b a + + in = _mm_shuffle_epi8(in, _mm_set_epi8( + 10, 11, 9, 10, + 7, 8, 6, 7, + 4, 5, 3, 4, + 1, 2, 0, 1)); + // in, bytes MSB to LSB: + // k l j k + // h i g h + // e f d e + // b c a b + + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + // bits, upper case are most significant bits, lower case are least significant bits + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 
+ // 00cccccc 00000000 00aaBBBB 00000000 + + return _mm_or_si128(t1, t3); + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA +} diff --git a/lib/arch/ssse3_atom/enc_tail.c b/lib/arch/ssse3_atom/enc_tail.c new file mode 100644 index 00000000..45312546 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_tail.c @@ -0,0 +1,34 @@ + if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[*c >> 2]; + st.carry = (*c++ << 4) & 0x30; + st.bytes++; + outl += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*c >> 4)]; + st.carry = (*c++ << 2) & 0x3C; + st.bytes++; + outl += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (srclen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*c >> 6)]; + *o++ = base64_table_enc_6bit[*c++ & 0x3F]; + st.bytes = 0; + outl += 2; + } +} +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = outl; diff --git a/lib/arch/ssse3_atom/enc_translate.c b/lib/arch/ssse3_atom/enc_translate.c new file mode 100644 index 00000000..9fbbbf75 --- /dev/null +++ b/lib/arch/ssse3_atom/enc_translate.c @@ -0,0 +1,34 @@ +static inline __m128i +enc_translate (const __m128i in) +{ + // LUT contains Absolute offset for all ranges: + const __m128i lut = _mm_setr_epi8( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + // Translate values 0..63 to the Base64 alphabet. 
There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from input: + // the index for range #0 is right, others are 1 less than expected: + __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51)); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + __m128i mask = CMPGT(in, 25); + + // subtract -1, so add 1 to indices for range #[1..4]. All indices are now correct: + indices = _mm_sub_epi8(indices, mask); + + // Add offsets to input values: + __m128i out = _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); + + return out; +} diff --git a/lib/codec_choose.c b/lib/codec_choose.c index 6a07d6a7..dd0b7c1d 100644 --- a/lib/codec_choose.c +++ b/lib/codec_choose.c @@ -83,58 +83,96 @@ BASE64_CODEC_FUNCS(ssse3) BASE64_CODEC_FUNCS(sse41) BASE64_CODEC_FUNCS(sse42) BASE64_CODEC_FUNCS(avx) +BASE64_CODEC_FUNCS(ssse3_atom) + +static bool avx2_supported(void); +static bool avx_supported(void); +static bool sse42_supported(void); +static bool sse41_supported(void); +static bool ssse3_supported(void); +static bool ssse3_atom_supported(void); static bool codec_choose_forced (struct codec *codec, int flags) { - // If the user wants to use a certain codec, - // always allow it, even if the codec is a no-op. + bool check; + // If the user wants to use a certain codec, always allow it, + // even if the codec is a no-op, except when BASE64_CHECK_SUPPORT + // is set. // For testing purposes. 
- if (!(flags & 0xFF)) { + check = flags & BASE64_CHECK_SUPPORT; + flags = flags & ~BASE64_CHECK_SUPPORT; + if (!(flags & 0xFFF)) { return false; } if (flags & BASE64_FORCE_AVX2) { - codec->enc = base64_stream_encode_avx2; - codec->dec = base64_stream_decode_avx2; - return true; + if (!check || avx2_supported()) { + codec->enc = base64_stream_encode_avx2; + codec->dec = base64_stream_decode_avx2; + } else { + codec->enc = NULL; + codec->dec = NULL; + } } if (flags & BASE64_FORCE_NEON32) { codec->enc = base64_stream_encode_neon32; codec->dec = base64_stream_decode_neon32; - return true; } if (flags & BASE64_FORCE_NEON64) { codec->enc = base64_stream_encode_neon64; codec->dec = base64_stream_decode_neon64; - return true; } if (flags & BASE64_FORCE_PLAIN) { codec->enc = base64_stream_encode_plain; codec->dec = base64_stream_decode_plain; - return true; } if (flags & BASE64_FORCE_SSSE3) { - codec->enc = base64_stream_encode_ssse3; - codec->dec = base64_stream_decode_ssse3; - return true; + if (!check || ssse3_supported()) { + codec->enc = base64_stream_encode_ssse3; + codec->dec = base64_stream_decode_ssse3; + } else { + codec->enc = NULL; + codec->dec = NULL; + } } if (flags & BASE64_FORCE_SSE41) { - codec->enc = base64_stream_encode_sse41; - codec->dec = base64_stream_decode_sse41; - return true; + if (!check || sse41_supported()) { + codec->enc = base64_stream_encode_sse41; + codec->dec = base64_stream_decode_sse41; + } else { + codec->enc = NULL; + codec->dec = NULL; + } } if (flags & BASE64_FORCE_SSE42) { - codec->enc = base64_stream_encode_sse42; - codec->dec = base64_stream_decode_sse42; - return true; + if (!check || sse42_supported()) { + codec->enc = base64_stream_encode_sse42; + codec->dec = base64_stream_decode_sse42; + } else { + codec->enc = NULL; + codec->dec = NULL; + } } if (flags & BASE64_FORCE_AVX) { - codec->enc = base64_stream_encode_avx; - codec->dec = base64_stream_decode_avx; - return true; + if (!check || avx_supported()) { + codec->enc = 
base64_stream_encode_avx; + codec->dec = base64_stream_decode_avx; + } else { + codec->enc = NULL; + codec->dec = NULL; + } } - return false; + if (flags & BASE64_FORCE_SSSE3_ATOM) { + if (!check || ssse3_atom_supported()) { + codec->enc = base64_stream_encode_ssse3_atom; + codec->dec = base64_stream_decode_ssse3_atom; + } else { + codec->enc = NULL; + codec->dec = NULL; + } + } + return true; } static bool @@ -162,23 +200,21 @@ codec_choose_arm (struct codec *codec) #endif } -static bool -codec_choose_x86 (struct codec *codec) -{ -#ifdef BASE64_X86_SIMD +#if HAVE_AVX2 +static bool avx2_supported(void) +{ unsigned int eax, ebx = 0, ecx = 0, edx; unsigned int max_level; - #ifdef _MSC_VER +#ifdef _MSC_VER int info[4]; __cpuidex(info, 0, 0); max_level = info[0]; - #else +#else max_level = __get_cpuid_max(0, NULL); - #endif +#endif - #if HAVE_AVX2 || HAVE_AVX // Check for AVX/AVX2 support: // Checking for AVX requires 3 things: // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions @@ -195,71 +231,214 @@ codec_choose_x86 (struct codec *codec) uint64_t xcr_mask; xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { - #if HAVE_AVX2 if (max_level >= 7) { __cpuid_count(7, 0, eax, ebx, ecx, edx); if (ebx & bit_AVX2) { - codec->enc = base64_stream_encode_avx2; - codec->dec = base64_stream_decode_avx2; return true; } } - #endif - #if HAVE_AVX + } + } + } + + return false; +} +#else +static bool avx2_supported(void) {return false;} +#endif + +#if HAVE_AVX +static bool avx_supported(void) +{ + unsigned int eax, ebx = 0, ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); +#endif + + // Check for AVX/AVX2 support: + // Checking for AVX requires 3 things: + // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions + // (allowing saving YMM registers on context switch) + // 2) CPUID 
indicates support for AVX + // 3) XGETBV indicates the AVX registers will be saved and restored on + // context switch + // + // Note that XGETBV is only available on 686 or later CPUs, so the + // instruction needs to be conditionally run. + if (max_level >= 1) { + __cpuid_count(1, 0, eax, ebx, ecx, edx); + if (ecx & bit_XSAVE_XRSTORE) { + uint64_t xcr_mask; + xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { __cpuid_count(1, 0, eax, ebx, ecx, edx); if (ecx & bit_AVX) { - codec->enc = base64_stream_encode_avx; - codec->dec = base64_stream_decode_avx; return true; } - #endif } } } - #endif + return false; +} +#else +static bool avx_supported(void) {return false;} +#endif + +#if HAVE_SSE42 +static bool sse42_supported(void) +{ + unsigned int eax, ebx = 0, ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); +#endif - #if HAVE_SSE42 // Check for SSE42 support: if (max_level >= 1) { __cpuid(1, eax, ebx, ecx, edx); if (ecx & bit_SSE42) { - codec->enc = base64_stream_encode_sse42; - codec->dec = base64_stream_decode_sse42; return true; } } - #endif + return false; +} +#else +static bool sse42_supported(void) {return false;} +#endif + +#if HAVE_SSE41 +static bool sse41_supported(void) +{ + unsigned int eax, ebx = 0, ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); +#endif - #if HAVE_SSE41 // Check for SSE41 support: if (max_level >= 1) { __cpuid(1, eax, ebx, ecx, edx); if (ecx & bit_SSE41) { - codec->enc = base64_stream_encode_sse41; - codec->dec = base64_stream_decode_sse41; return true; } } - #endif - #if HAVE_SSSE3 + return false; +} +#else +static bool sse41_supported(void) {return false;} +#endif + +#if HAVE_SSSE3 +static bool ssse3_supported(void) +{ + unsigned int eax, ebx = 0, 
ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); +#endif + // Check for SSSE3 support: if (max_level >= 1) { __cpuid(1, eax, ebx, ecx, edx); if (ecx & bit_SSSE3) { - codec->enc = base64_stream_encode_ssse3; - codec->dec = base64_stream_decode_ssse3; return true; } } - #endif + return false; +} #else - (void)codec; +static bool ssse3_supported(void) {return false;} +#endif + +#if HAVE_SSSE3 +static bool ssse3_atom_supported(void) +{ + unsigned int eax, ebx = 0, ecx = 0, edx; + unsigned int max_level; + +#ifdef _MSC_VER + int info[4]; + __cpuidex(info, 0, 0); + max_level = info[0]; +#else + max_level = __get_cpuid_max(0, NULL); #endif + // Same CPUID bit as ssse3_supported(); no Atom-specific detection: + if (max_level >= 1) { + __cpuid(1, eax, ebx, ecx, edx); + if (ecx & bit_SSSE3) { + return true; + } + } + return false; } +#else +static bool ssse3_atom_supported(void) {return false;} +#endif + + +static bool +codec_choose_x86 (struct codec *codec) +{ + if (avx2_supported()) { + codec->enc = base64_stream_encode_avx2; + codec->dec = base64_stream_decode_avx2; + return true; + } + if (avx_supported()) { + codec->enc = base64_stream_encode_avx; + codec->dec = base64_stream_decode_avx; + return true; + } + if (sse42_supported()) { + codec->enc = base64_stream_encode_sse42; + codec->dec = base64_stream_decode_sse42; + return true; + } + if (sse41_supported()) { + codec->enc = base64_stream_encode_sse41; + codec->dec = base64_stream_decode_sse41; + return true; + } + if (ssse3_supported()) { + codec->enc = base64_stream_encode_ssse3; + codec->dec = base64_stream_decode_ssse3; + return true; + } + if (ssse3_atom_supported()) { + codec->enc = base64_stream_encode_ssse3_atom; + codec->dec = base64_stream_decode_ssse3_atom; + return true; + } + // No supported SIMD codec was detected: + + return false; +} void codec_choose (struct codec *codec, int flags) diff --git a/lib/lib.c b/lib/lib.c index 4703512b..c8b6721c 100644 --- 
a/lib/lib.c +++ b/lib/lib.c @@ -1,3 +1,4 @@ +#include <stdbool.h> #include <stdint.h> #include <stddef.h> #ifdef _OPENMP @@ -18,7 +19,7 @@ void base64_stream_encode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.enc == NULL || flags & 0xFF) { + if (codec.enc == NULL || flags & 0xFFF) { codec_choose(&codec, flags); } state->eof = 0; @@ -74,7 +75,7 @@ base64_stream_decode_init (struct base64_state *state, int flags) state->eof = 0; state->bytes = 0; state->carry = 0; - state->flags = flags; + state->flags = flags & ~BASE64_CHECK_SUPPORT; } int @@ -143,6 +144,14 @@ base64_decode { int ret; struct base64_state state; + bool check; + + check = (flags & BASE64_CHECK_SUPPORT) != 0; + if (check) { + base64_stream_decode_init(&state, flags); + if (codec.dec == NULL) return -1; + flags = 0; + } #ifdef _OPENMP if (srclen >= OMP_THRESHOLD) { diff --git a/test/benchmark.c b/test/benchmark.c index 80d21a38..0734ee98 100644 --- a/test/benchmark.c +++ b/test/benchmark.c @@ -213,7 +213,7 @@ main () } // Loop over all buffer sizes: - for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { + for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]) ; i++) { printf("Testing with buffer size %s, fastest of %d * %d\n", sizes[i].label, sizes[i].repeat, sizes[i].batch); diff --git a/test/codec_supported.c b/test/codec_supported.c index a027b994..d5579c61 100644 --- a/test/codec_supported.c +++ b/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "SSSE3_ATOM" , NULL } ; @@ -24,5 +25,6 @@ codec_supported (int flags) char b[10]; size_t outlen; + flags |= BASE64_CHECK_SUPPORT; return (base64_decode(a, strlen(a), b, &outlen, flags) != -1); }