From 805326917b1c21cbac3f007dcf389889cf01f0da Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Mon, 9 Jan 2023 16:27:03 -0500 Subject: [PATCH] Initial AVX512 support Adds SIMD_16x32 type plus Serpent and ChaCha using AVX-512 --- src/build-data/arch/x86_64.txt | 1 + src/build-data/cc/clang.txt | 2 + src/build-data/cc/gcc.txt | 2 + src/build-data/cc/msvc.txt | 1 + src/lib/block/serpent/serpent.cpp | 35 +- src/lib/block/serpent/serpent.h | 13 +- src/lib/block/serpent/serpent_avx512/info.txt | 16 + .../serpent/serpent_avx512/serpent_avx512.cpp | 132 ++++++ src/lib/block/serpent/serpent_sbox.h | 1 + src/lib/stream/chacha/chacha.cpp | 26 +- src/lib/stream/chacha/chacha.h | 4 + .../chacha/chacha_avx512/chacha_avx512.cpp | 212 ++++++++++ src/lib/stream/chacha/chacha_avx512/info.txt | 16 + src/lib/utils/cpuid/cpuid_x86.cpp | 10 +- src/lib/utils/simd/simd_avx512/info.txt | 22 + src/lib/utils/simd/simd_avx512/simd_avx512.h | 391 ++++++++++++++++++ src/tests/data/block/serpent.vec | 2 +- src/tests/data/stream/chacha.vec | 2 +- 18 files changed, 876 insertions(+), 12 deletions(-) create mode 100644 src/lib/block/serpent/serpent_avx512/info.txt create mode 100644 src/lib/block/serpent/serpent_avx512/serpent_avx512.cpp create mode 100644 src/lib/stream/chacha/chacha_avx512/chacha_avx512.cpp create mode 100644 src/lib/stream/chacha/chacha_avx512/info.txt create mode 100644 src/lib/utils/simd/simd_avx512/info.txt create mode 100644 src/lib/utils/simd/simd_avx512/simd_avx512.h diff --git a/src/build-data/arch/x86_64.txt b/src/build-data/arch/x86_64.txt index 729363e6fa3..cd46542557b 100644 --- a/src/build-data/arch/x86_64.txt +++ b/src/build-data/arch/x86_64.txt @@ -22,4 +22,5 @@ sse2 sse41 sse42 ssse3 +avx512 diff --git a/src/build-data/cc/clang.txt b/src/build-data/cc/clang.txt index 2b3b3d54d44..7b4bc3b0bb0 100644 --- a/src/build-data/cc/clang.txt +++ b/src/build-data/cc/clang.txt @@ -58,6 +58,8 @@ ssse3 -> "-mssse3" sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" +avx512 -> "-mavx512f -mavx512bw -mavx512dq -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vl -mavx512ifma" + bmi2 -> "-mbmi -mbmi2" aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" diff --git a/src/build-data/cc/gcc.txt b/src/build-data/cc/gcc.txt index 12c660f3e0d..edf2e5fb61f 100644 --- a/src/build-data/cc/gcc.txt +++ b/src/build-data/cc/gcc.txt @@ -61,6 +61,8 @@ ssse3 -> "-mssse3" sse41 -> "-msse4.1" sse42 -> "-msse4.2" avx2 -> "-mavx2" +avx512 -> "-mavx512f -mavx512bw -mavx512dq -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vl -mavx512ifma" + bmi2 -> "-mbmi -mbmi2" aesni -> "-maes -mpclmul" rdrand -> "-mrdrnd" diff --git a/src/build-data/cc/msvc.txt b/src/build-data/cc/msvc.txt index 5c32fffaafd..b4c2f25ddce 100644 --- a/src/build-data/cc/msvc.txt +++ b/src/build-data/cc/msvc.txt @@ -56,6 +56,7 @@ ssse3 -> "" sse41 -> "" sse42 -> "" x86_64:avx2 -> "/arch:AVX" +x86_64:avx512 -> "/arch:AVX512" bmi2 -> "" aesni -> "" clmul -> "" diff --git a/src/lib/block/serpent/serpent.cpp b/src/lib/block/serpent/serpent.cpp index 1916f55785a..e70a1dd9ad9 100644 --- a/src/lib/block/serpent/serpent.cpp +++ b/src/lib/block/serpent/serpent.cpp @@ -11,7 +11,7 @@ #include #include -#if defined(BOTAN_HAS_SERPENT_SIMD) || defined(BOTAN_HAS_SERPENT_AVX2) +#if defined(BOTAN_HAS_SERPENT_SIMD) || defined(BOTAN_HAS_SERPENT_AVX2) || defined(BOTAN_HAS_SERPENT_AVX512) #include #endif @@ -26,6 +26,19 @@ void Serpent::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const assert_key_material_set(); +#if defined(BOTAN_HAS_SERPENT_AVX512) + if(CPUID::has_avx512()) + { + while(blocks >= 16) + { + avx512_encrypt_16(in, out); + in += 16 * BLOCK_SIZE; + out += 16 * BLOCK_SIZE; + blocks -= 16; + } + } +#endif + #if defined(BOTAN_HAS_SERPENT_AVX2) if(CPUID::has_avx2()) { @@ -105,6 +118,19 @@ void Serpent::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const assert_key_material_set(); +#if defined(BOTAN_HAS_SERPENT_AVX512) + if(CPUID::has_avx512()) + { + while(blocks >= 16) + { + avx512_decrypt_16(in, out); + in += 16 * BLOCK_SIZE; + out += 16 * BLOCK_SIZE; + blocks -= 16; + } + } +#endif + #if defined(BOTAN_HAS_SERPENT_AVX2) if(CPUID::has_avx2()) { @@ -252,6 +278,13 @@ void Serpent::clear() std::string Serpent::provider() const { +#if defined(BOTAN_HAS_SERPENT_AVX512) + if(CPUID::has_avx512()) + { + return "avx512"; + } +#endif + #if defined(BOTAN_HAS_SERPENT_AVX2) if(CPUID::has_avx2()) { diff --git a/src/lib/block/serpent/serpent.h b/src/lib/block/serpent/serpent.h index 66518bf3a70..805ffa0d5de 100644 --- a/src/lib/block/serpent/serpent.h +++ b/src/lib/block/serpent/serpent.h @@ -33,13 +33,18 @@ class Serpent final : public Block_Cipher_Fixed_Params<16, 16, 32, 8> private: #if defined(BOTAN_HAS_SERPENT_SIMD) - void simd_encrypt_4(const uint8_t in[64], uint8_t out[64]) const; - void simd_decrypt_4(const uint8_t in[64], uint8_t out[64]) const; + void simd_encrypt_4(const uint8_t in[16*4], uint8_t out[16*4]) const; + void simd_decrypt_4(const uint8_t in[16*4], uint8_t out[16*4]) const; #endif #if defined(BOTAN_HAS_SERPENT_AVX2) - void avx2_encrypt_8(const uint8_t in[128], uint8_t out[128]) const; - void avx2_decrypt_8(const uint8_t in[128], uint8_t out[128]) const; + void avx2_encrypt_8(const uint8_t in[16*8], uint8_t out[16*8]) const; + void avx2_decrypt_8(const uint8_t in[16*8], uint8_t out[16*8]) const; +#endif + +#if defined(BOTAN_HAS_SERPENT_AVX512) + void avx512_encrypt_16(const uint8_t in[16*16], uint8_t out[16*16]) const; + void avx512_decrypt_16(const uint8_t in[16*16], uint8_t out[16*16]) const; #endif void key_schedule(const uint8_t key[], size_t length) override; diff --git a/src/lib/block/serpent/serpent_avx512/info.txt b/src/lib/block/serpent/serpent_avx512/info.txt new file mode 100644 index 00000000000..ec7396d0dff --- /dev/null +++ b/src/lib/block/serpent/serpent_avx512/info.txt @@ -0,0 +1,16 @@ + +SERPENT_AVX512 -> 20230101 + + + +name -> "Serpent AVX512" +brief -> "Serpent using AVX512 instructions" + + + +avx512 + + + +simd_avx512 + diff --git a/src/lib/block/serpent/serpent_avx512/serpent_avx512.cpp b/src/lib/block/serpent/serpent_avx512/serpent_avx512.cpp new file mode 100644 index 00000000000..35f24e0e2c1 --- /dev/null +++ b/src/lib/block/serpent/serpent_avx512/serpent_avx512.cpp @@ -0,0 +1,132 @@ +/* +* (C) 2023 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include +#include +#include + +namespace Botan { + +BOTAN_AVX512_FN +void Serpent::avx512_encrypt_16(const uint8_t in[16*16], uint8_t out[16*16]) const + { + using namespace Botan::Serpent_F; + + SIMD_16x32 B0 = SIMD_16x32::load_le(in); + SIMD_16x32 B1 = SIMD_16x32::load_le(in + 64); + SIMD_16x32 B2 = SIMD_16x32::load_le(in + 128); + SIMD_16x32 B3 = SIMD_16x32::load_le(in + 192); + + SIMD_16x32::transpose(B0, B1, B2, B3); + + const Key_Inserter key_xor(m_round_key.data()); + + key_xor( 0,B0,B1,B2,B3); SBoxE0(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 1,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 2,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 3,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 4,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 5,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 6,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 7,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + + key_xor( 8,B0,B1,B2,B3); SBoxE0(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor( 9,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(10,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(11,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(12,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(13,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(14,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(15,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + + key_xor(16,B0,B1,B2,B3); SBoxE0(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(17,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(18,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(19,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(20,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(21,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(22,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(23,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); transform(B0,B1,B2,B3); + + key_xor(24,B0,B1,B2,B3); SBoxE0(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(25,B0,B1,B2,B3); SBoxE1(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(26,B0,B1,B2,B3); SBoxE2(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(27,B0,B1,B2,B3); SBoxE3(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(28,B0,B1,B2,B3); SBoxE4(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(29,B0,B1,B2,B3); SBoxE5(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(30,B0,B1,B2,B3); SBoxE6(B0,B1,B2,B3); transform(B0,B1,B2,B3); + key_xor(31,B0,B1,B2,B3); SBoxE7(B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3); + + SIMD_16x32::transpose(B0, B1, B2, B3); + B0.store_le(out); + B1.store_le(out + 64); + B2.store_le(out + 128); + B3.store_le(out + 192); + + SIMD_16x32::zero_registers(); + } + +BOTAN_AVX512_FN +void Serpent::avx512_decrypt_16(const uint8_t in[16*16], uint8_t out[16*16]) const + { + using namespace Botan::Serpent_F; + + SIMD_16x32 B0 = SIMD_16x32::load_le(in); + SIMD_16x32 B1 = SIMD_16x32::load_le(in + 64); + SIMD_16x32 B2 = SIMD_16x32::load_le(in + 128); + SIMD_16x32 B3 = SIMD_16x32::load_le(in + 192); + + SIMD_16x32::transpose(B0, B1, B2, B3); + + const Key_Inserter key_xor(m_round_key.data()); + + key_xor(32,B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD0(B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD0(B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD0(B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3); + + i_transform(B0,B1,B2,B3); SBoxD7(B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD6(B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD5(B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD4(B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD3(B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD2(B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD1(B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3); + i_transform(B0,B1,B2,B3); SBoxD0(B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3); + + SIMD_16x32::transpose(B0, B1, B2, B3); + + B0.store_le(out); + B1.store_le(out + 64); + B2.store_le(out + 128); + B3.store_le(out + 192); + + SIMD_16x32::zero_registers(); + } + +} diff --git a/src/lib/block/serpent/serpent_sbox.h b/src/lib/block/serpent/serpent_sbox.h index 245ebab3651..0a5e37232ed 100644 --- a/src/lib/block/serpent/serpent_sbox.h +++ b/src/lib/block/serpent/serpent_sbox.h @@ -11,6 +11,7 @@ #define BOTAN_SERPENT_FUNCS_H_ #include +#include namespace Botan::Serpent_F { diff --git a/src/lib/stream/chacha/chacha.cpp b/src/lib/stream/chacha/chacha.cpp index 0d2cfdd7715..78e1a58b65a 100644 --- a/src/lib/stream/chacha/chacha.cpp +++ b/src/lib/stream/chacha/chacha.cpp @@ -1,6 +1,6 @@ /* * ChaCha -* (C) 2014,2018 Jack Lloyd +* (C) 2014,2018,2023 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ @@ -73,6 +73,11 @@ ChaCha::ChaCha(size_t rounds) : m_rounds(rounds) size_t ChaCha::parallelism() { +#if defined(BOTAN_HAS_CHACHA_AVX512) + if(CPUID::has_avx512()) + return 16; +#endif + #if defined(BOTAN_HAS_CHACHA_AVX2) if(CPUID::has_avx2()) return 8; @@ -83,6 +88,13 @@ size_t ChaCha::parallelism() std::string ChaCha::provider() const { +#if defined(BOTAN_HAS_CHACHA_AVX512) + if(CPUID::has_avx512()) + { + return "avx512"; + } +#endif + #if defined(BOTAN_HAS_CHACHA_AVX2) if(CPUID::has_avx2()) { @@ -106,6 +118,18 @@ void ChaCha::chacha(uint8_t output[], { BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); +#if defined(BOTAN_HAS_CHACHA_AVX512) + if(CPUID::has_avx512()) + { + while(output_blocks >= 16) + { + ChaCha::chacha_avx512_x16(output, state, rounds); + output += 16*64; + output_blocks -= 16; + } + } +#endif + #if defined(BOTAN_HAS_CHACHA_AVX2) if(CPUID::has_avx2()) { diff --git a/src/lib/stream/chacha/chacha.h b/src/lib/stream/chacha/chacha.h index ac02c05e5c9..174f7f9d3d4 100644 --- a/src/lib/stream/chacha/chacha.h +++ b/src/lib/stream/chacha/chacha.h @@ -76,6 +76,10 @@ class ChaCha final : public StreamCipher static void chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds); #endif +#if defined(BOTAN_HAS_CHACHA_AVX512) + static void chacha_avx512_x16(uint8_t output[64*16], uint32_t state[16], size_t rounds); +#endif + size_t m_rounds; secure_vector m_key; secure_vector m_state; diff --git a/src/lib/stream/chacha/chacha_avx512/chacha_avx512.cpp b/src/lib/stream/chacha/chacha_avx512/chacha_avx512.cpp new file mode 100644 index 00000000000..ccaf56eedb8 --- /dev/null +++ b/src/lib/stream/chacha/chacha_avx512/chacha_avx512.cpp @@ -0,0 +1,212 @@ +/* +* (C) 2023 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include +#include + +namespace Botan { + +//static +BOTAN_AVX512_FN +void ChaCha::chacha_avx512_x16(uint8_t output[64*16], uint32_t state[16], size_t rounds) + { + BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); + const SIMD_16x32 CTR0 = SIMD_16x32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + const uint32_t C = 0xFFFFFFFF - state[12]; + const SIMD_16x32 CTR1 = SIMD_16x32( + 0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7, + C < 8, C < 9, C < 10, C < 11, C < 12, C < 13, C < 14, C < 15 + ); + + SIMD_16x32 R00 = SIMD_16x32::splat(state[ 0]); + SIMD_16x32 R01 = SIMD_16x32::splat(state[ 1]); + SIMD_16x32 R02 = SIMD_16x32::splat(state[ 2]); + SIMD_16x32 R03 = SIMD_16x32::splat(state[ 3]); + SIMD_16x32 R04 = SIMD_16x32::splat(state[ 4]); + SIMD_16x32 R05 = SIMD_16x32::splat(state[ 5]); + SIMD_16x32 R06 = SIMD_16x32::splat(state[ 6]); + SIMD_16x32 R07 = SIMD_16x32::splat(state[ 7]); + SIMD_16x32 R08 = SIMD_16x32::splat(state[ 8]); + SIMD_16x32 R09 = SIMD_16x32::splat(state[ 9]); + SIMD_16x32 R10 = SIMD_16x32::splat(state[10]); + SIMD_16x32 R11 = SIMD_16x32::splat(state[11]); + SIMD_16x32 R12 = SIMD_16x32::splat(state[12]) + CTR0; + SIMD_16x32 R13 = SIMD_16x32::splat(state[13]) + CTR1; + SIMD_16x32 R14 = SIMD_16x32::splat(state[14]); + SIMD_16x32 R15 = SIMD_16x32::splat(state[15]); + + for(size_t r = 0; r != rounds / 2; ++r) + { + R00 += R04; + R01 += R05; + R02 += R06; + R03 += R07; + + R12 ^= R00; + R13 ^= R01; + R14 ^= R02; + R15 ^= R03; + + R12 = R12.rotl<16>(); + R13 = R13.rotl<16>(); + R14 = R14.rotl<16>(); + R15 = R15.rotl<16>(); + + R08 += R12; + R09 += R13; + R10 += R14; + R11 += R15; + + R04 ^= R08; + R05 ^= R09; + R06 ^= R10; + R07 ^= R11; + + R04 = R04.rotl<12>(); + R05 = R05.rotl<12>(); + R06 = R06.rotl<12>(); + R07 = R07.rotl<12>(); + + R00 += R04; + R01 += R05; + R02 += R06; + R03 += R07; + + R12 ^= R00; + R13 ^= R01; + R14 ^= R02; + R15 ^= R03; + + R12 = R12.rotl<8>(); + R13 = R13.rotl<8>(); + R14 = R14.rotl<8>(); + R15 = R15.rotl<8>(); + + R08 += R12; + R09 += R13; + R10 += R14; + R11 += R15; + + R04 ^= R08; + R05 ^= R09; + R06 ^= R10; + R07 ^= R11; + + R04 = R04.rotl<7>(); + R05 = R05.rotl<7>(); + R06 = R06.rotl<7>(); + R07 = R07.rotl<7>(); + + R00 += R05; + R01 += R06; + R02 += R07; + R03 += R04; + + R15 ^= R00; + R12 ^= R01; + R13 ^= R02; + R14 ^= R03; + + R15 = R15.rotl<16>(); + R12 = R12.rotl<16>(); + R13 = R13.rotl<16>(); + R14 = R14.rotl<16>(); + + R10 += R15; + R11 += R12; + R08 += R13; + R09 += R14; + + R05 ^= R10; + R06 ^= R11; + R07 ^= R08; + R04 ^= R09; + + R05 = R05.rotl<12>(); + R06 = R06.rotl<12>(); + R07 = R07.rotl<12>(); + R04 = R04.rotl<12>(); + + R00 += R05; + R01 += R06; + R02 += R07; + R03 += R04; + + R15 ^= R00; + R12 ^= R01; + R13 ^= R02; + R14 ^= R03; + + R15 = R15.rotl<8>(); + R12 = R12.rotl<8>(); + R13 = R13.rotl<8>(); + R14 = R14.rotl<8>(); + + R10 += R15; + R11 += R12; + R08 += R13; + R09 += R14; + + R05 ^= R10; + R06 ^= R11; + R07 ^= R08; + R04 ^= R09; + + R05 = R05.rotl<7>(); + R06 = R06.rotl<7>(); + R07 = R07.rotl<7>(); + R04 = R04.rotl<7>(); + } + + R00 += SIMD_16x32::splat(state[0]); + R01 += SIMD_16x32::splat(state[1]); + R02 += SIMD_16x32::splat(state[2]); + R03 += SIMD_16x32::splat(state[3]); + R04 += SIMD_16x32::splat(state[4]); + R05 += SIMD_16x32::splat(state[5]); + R06 += SIMD_16x32::splat(state[6]); + R07 += SIMD_16x32::splat(state[7]); + R08 += SIMD_16x32::splat(state[8]); + R09 += SIMD_16x32::splat(state[9]); + R10 += SIMD_16x32::splat(state[10]); + R11 += SIMD_16x32::splat(state[11]); + R12 += SIMD_16x32::splat(state[12]) + CTR0; + R13 += SIMD_16x32::splat(state[13]) + CTR1; + R14 += SIMD_16x32::splat(state[14]); + R15 += SIMD_16x32::splat(state[15]); + + SIMD_16x32::transpose( + R00, R01, R02, R03, + R04, R05, R06, R07, + R08, R09, R10, R11, + R12, R13, R14, R15); + + R00.store_le(output); + R01.store_le(output + 64*1); + R02.store_le(output + 64*2); + R03.store_le(output + 64*3); + R04.store_le(output + 64*4); + R05.store_le(output + 64*5); + R06.store_le(output + 64*6); + R07.store_le(output + 64*7); + R08.store_le(output + 64*8); + R09.store_le(output + 64*9); + R10.store_le(output + 64*10); + R11.store_le(output + 64*11); + R12.store_le(output + 64*12); + R13.store_le(output + 64*13); + R14.store_le(output + 64*14); + R15.store_le(output + 64*15); + + SIMD_16x32::zero_registers(); + + state[12] += 16; + if(state[12] < 16) + state[13]++; + } +} diff --git a/src/lib/stream/chacha/chacha_avx512/info.txt b/src/lib/stream/chacha/chacha_avx512/info.txt new file mode 100644 index 00000000000..84385bd986b --- /dev/null +++ b/src/lib/stream/chacha/chacha_avx512/info.txt @@ -0,0 +1,16 @@ + +CHACHA_AVX512 -> 20230101 + + + +name -> "ChaCha20 AVX512" +brief -> "ChaCha20 using AVX512 instructions" + + + +avx512 + + + +simd_avx512 + diff --git a/src/lib/utils/cpuid/cpuid_x86.cpp b/src/lib/utils/cpuid/cpuid_x86.cpp index 39b6f847f84..ca63c43615c 100644 --- a/src/lib/utils/cpuid/cpuid_x86.cpp +++ b/src/lib/utils/cpuid/cpuid_x86.cpp @@ -203,12 +203,14 @@ uint32_t CPUID::CPUID_Data::detect_cpu_features() bought such a processor. */ if((flags7 & AVX512_PROFILE_FLAGS) == AVX512_PROFILE_FLAGS) + { features_detected |= CPUID::CPUID_AVX512_BIT; - if(flags7 & x86_CPUID_7_bits::AVX512_VAES) - features_detected |= CPUID::CPUID_AVX512_AES_BIT; - if(flags7 & x86_CPUID_7_bits::AVX512_VCLMUL) - features_detected |= CPUID::CPUID_AVX512_CLMUL_BIT; + if(flags7 & x86_CPUID_7_bits::AVX512_VAES) + features_detected |= CPUID::CPUID_AVX512_AES_BIT; + if(flags7 & x86_CPUID_7_bits::AVX512_VCLMUL) + features_detected |= CPUID::CPUID_AVX512_CLMUL_BIT; + } } } diff --git a/src/lib/utils/simd/simd_avx512/info.txt b/src/lib/utils/simd/simd_avx512/info.txt new file mode 100644 index 00000000000..60763e6ac38 --- /dev/null +++ b/src/lib/utils/simd/simd_avx512/info.txt @@ -0,0 +1,22 @@ + +SIMD_AVX512 -> 20230101 + + + +name -> "AVX512" +brief -> "Helpers for working with AVX512 instructions" + + + +avx512 + + + +simd_avx512.h + + + +gcc +clang +msvc + diff --git a/src/lib/utils/simd/simd_avx512/simd_avx512.h b/src/lib/utils/simd/simd_avx512/simd_avx512.h new file mode 100644 index 00000000000..acbea144e59 --- /dev/null +++ b/src/lib/utils/simd/simd_avx512/simd_avx512.h @@ -0,0 +1,391 @@ +/* +* (C) 2023 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#ifndef BOTAN_SIMD_AVX512_H_ +#define BOTAN_SIMD_AVX512_H_ + +#include +#include + +namespace Botan { + +#define BOTAN_AVX512_FN BOTAN_FUNC_ISA("avx512f,avx512dq,avx512bw") + +class SIMD_16x32 final + { + public: + + SIMD_16x32& operator=(const SIMD_16x32& other) = default; + SIMD_16x32(const SIMD_16x32& other) = default; + + SIMD_16x32& operator=(SIMD_16x32&& other) = default; + SIMD_16x32(SIMD_16x32&& other) = default; + + BOTAN_AVX512_FN + BOTAN_FORCE_INLINE SIMD_16x32() + { + m_avx512 = _mm512_setzero_si512(); + } + + BOTAN_AVX512_FN + explicit SIMD_16x32(const uint32_t B[16]) + { + m_avx512 = _mm512_loadu_si512(reinterpret_cast(B)); + } + + BOTAN_AVX512_FN + explicit SIMD_16x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, + uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7, + uint32_t B8, uint32_t B9, uint32_t BA, uint32_t BB, + uint32_t BC, uint32_t BD, uint32_t BE, uint32_t BF) + { + m_avx512 = _mm512_set_epi32( + BF, BE, BD, BC, BB, BA, B9, B8, + B7, B6, B5, B4, B3, B2, B1, B0); + } + + BOTAN_AVX512_FN + static SIMD_16x32 splat(uint32_t B) + { + return SIMD_16x32(_mm512_set1_epi32(B)); + } + + BOTAN_AVX512_FN + static SIMD_16x32 load_le(const uint8_t* in) + { + return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast(in))); + } + + BOTAN_AVX512_FN + static SIMD_16x32 load_be(const uint8_t* in) + { + return load_le(in).bswap(); + } + + BOTAN_AVX512_FN + void store_le(uint8_t out[]) const + { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); + } + + BOTAN_AVX512_FN + void store_be(uint8_t out[]) const + { + bswap().store_le(out); + } + + template + BOTAN_AVX512_FN + SIMD_16x32 rotl() const + { + static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); + return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT)); + } + + template + BOTAN_AVX512_FN + SIMD_16x32 rotr() const + { + return this->rotl<32-ROT>(); + } + + SIMD_16x32 BOTAN_AVX512_FN sigma0() const + { + const SIMD_16x32 rot1 = this->rotr<2>(); + const SIMD_16x32 rot2 = this->rotr<13>(); + const SIMD_16x32 rot3 = this->rotr<22>(); + return rot1 ^ rot2 ^ rot3; + } + + SIMD_16x32 BOTAN_AVX512_FN sigma1() const + { + const SIMD_16x32 rot1 = this->rotr<6>(); + const SIMD_16x32 rot2 = this->rotr<11>(); + const SIMD_16x32 rot3 = this->rotr<25>(); + return rot1 ^ rot2 ^ rot3; + } + + BOTAN_AVX512_FN + SIMD_16x32 operator+(const SIMD_16x32& other) const + { + SIMD_16x32 retval(*this); + retval += other; + return retval; + } + + BOTAN_AVX512_FN + SIMD_16x32 operator-(const SIMD_16x32& other) const + { + SIMD_16x32 retval(*this); + retval -= other; + return retval; + } + + BOTAN_AVX512_FN + SIMD_16x32 operator^(const SIMD_16x32& other) const + { + SIMD_16x32 retval(*this); + retval ^= other; + return retval; + } + + BOTAN_AVX512_FN + SIMD_16x32 operator|(const SIMD_16x32& other) const + { + SIMD_16x32 retval(*this); + retval |= other; + return retval; + } + + BOTAN_AVX512_FN + SIMD_16x32 operator&(const SIMD_16x32& other) const + { + SIMD_16x32 retval(*this); + retval &= other; + return retval; + } + + BOTAN_AVX512_FN + void operator+=(const SIMD_16x32& other) + { + m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); + } + + BOTAN_AVX512_FN + void operator-=(const SIMD_16x32& other) + { + m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); + } + + BOTAN_AVX512_FN + void operator^=(const SIMD_16x32& other) + { + m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); + } + + BOTAN_AVX512_FN + void operator^=(uint32_t other) + { + *this ^= SIMD_16x32::splat(other); + } + + BOTAN_AVX512_FN + void operator|=(const SIMD_16x32& other) + { + m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); + } + + BOTAN_AVX512_FN + void operator&=(const SIMD_16x32& other) + { + m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); + } + + template BOTAN_AVX512_FN SIMD_16x32 shl() const + { + return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT)); + } + + template BOTAN_AVX512_FN SIMD_16x32 shr() const + { + return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT)); + } + + BOTAN_AVX512_FN + SIMD_16x32 operator~() const + { + return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); + } + + // (~reg) & other + BOTAN_AVX512_FN + SIMD_16x32 andc(const SIMD_16x32& other) const + { + return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512)); + } + + template + BOTAN_AVX512_FN + static SIMD_16x32 ternary_fn(const SIMD_16x32& a, + const SIMD_16x32& b, + const SIMD_16x32& c) + { + return _mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL); + } + + BOTAN_AVX512_FN + SIMD_16x32 bswap() const + { + const uint8_t BSWAP_MASK[64] = { + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 19, 18, 17, 16, + 23, 22, 21, 20, + 27, 26, 25, 24, + 31, 30, 29, 28, + 35, 34, 33, 32, + 39, 38, 37, 36, + 43, 42, 41, 40, + 47, 46, 45, 44, + 51, 50, 49, 48, + 55, 54, 53, 52, + 59, 58, 57, 56, + 63, 62, 61, 60, + }; + + const __m512i bswap = _mm512_loadu_si512(reinterpret_cast(BSWAP_MASK)); + + const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap); + + return SIMD_16x32(output); + } + + BOTAN_AVX512_FN + static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, + SIMD_16x32& B2, SIMD_16x32& B3) + { + const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512); + const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512); + const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512); + const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512); + + B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1); + B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1); + B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3); + B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3); + } + + BOTAN_AVX512_FN + static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, + SIMD_16x32& B2, SIMD_16x32& B3, + SIMD_16x32& B4, SIMD_16x32& B5, + SIMD_16x32& B6, SIMD_16x32& B7, + SIMD_16x32& B8, SIMD_16x32& B9, + SIMD_16x32& BA, SIMD_16x32& BB, + SIMD_16x32& BC, SIMD_16x32& BD, + SIMD_16x32& BE, SIMD_16x32& BF) + { + auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw()); + auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw()); + auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw()); + auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw()); + auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw()); + auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw()); + auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw()); + auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw()); + auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw()); + auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw()); + auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw()); + auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw()); + auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw()); + auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw()); + auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw()); + auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw()); + + auto r0 = _mm512_unpacklo_epi64(t0, t2); + auto r1 = _mm512_unpackhi_epi64(t0, t2); + auto r2 = _mm512_unpacklo_epi64(t1, t3); + auto r3 = _mm512_unpackhi_epi64(t1, t3); + auto r4 = _mm512_unpacklo_epi64(t4, t6); + auto r5 = _mm512_unpackhi_epi64(t4, t6); + auto r6 = _mm512_unpacklo_epi64(t5, t7); + auto r7 = _mm512_unpackhi_epi64(t5, t7); + auto r8 = _mm512_unpacklo_epi64(t8, ta); + auto r9 = _mm512_unpackhi_epi64(t8, ta); + auto ra = _mm512_unpacklo_epi64(t9, tb); + auto rb = _mm512_unpackhi_epi64(t9, tb); + auto rc = _mm512_unpacklo_epi64(tc, te); + auto rd = _mm512_unpackhi_epi64(tc, te); + auto re = _mm512_unpacklo_epi64(td, tf); + auto rf = _mm512_unpackhi_epi64(td, tf); + + t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); + t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); + t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); + t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); + t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); + t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); + t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); + t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); + t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); + t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); + ta = _mm512_shuffle_i32x4(ra, re, 0x88); + tb = _mm512_shuffle_i32x4(rb, rf, 0x88); + tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); + td = _mm512_shuffle_i32x4(r9, rd, 0xdd); + te = _mm512_shuffle_i32x4(ra, re, 0xdd); + tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); + + B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88); + B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88); + B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88); + B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88); + B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88); + B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88); + B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88); + B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88); + B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd); + B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd); + BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd); + BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd); + BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd); + BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd); + BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd); + BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd); + } + + BOTAN_AVX512_FN + static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) + { + return SIMD_16x32::ternary_fn<0xca>(mask, a, b); + } + + BOTAN_AVX512_FN + static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) + { + return SIMD_16x32::ternary_fn<0xe8>(x, y, z); + } + + BOTAN_FUNC_ISA("avx2") + static void zero_registers() + { + // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm32 + _mm256_zeroall(); + } + + __m512i BOTAN_AVX512_FN raw() const { return m_avx512; } + + BOTAN_AVX512_FN + SIMD_16x32(__m512i x) : m_avx512(x) {} + + private: + __m512i m_avx512; + }; + +template +inline SIMD_16x32 rotl(SIMD_16x32 input) + { + return input.rotl(); + } + +template +inline SIMD_16x32 rotr(SIMD_16x32 input) + { + return input.rotr(); + } + +// For Serpent: +template +inline SIMD_16x32 shl(SIMD_16x32 input) + { + return input.shl(); + } + +} + +#endif diff --git a/src/tests/data/block/serpent.vec b/src/tests/data/block/serpent.vec index 1009251bafb..8e75f341ad1 100644 --- a/src/tests/data/block/serpent.vec +++ b/src/tests/data/block/serpent.vec @@ -1,5 +1,5 @@ -#test cpuid avx2 simd +#test cpuid avx512 avx2 simd [Serpent] Key = 00000000000000000000000000000000 diff --git a/src/tests/data/stream/chacha.vec b/src/tests/data/stream/chacha.vec index 41381bba09c..346683b5ffb 100644 --- a/src/tests/data/stream/chacha.vec +++ b/src/tests/data/stream/chacha.vec @@ -1,5 +1,5 @@ -#test cpuid avx2 sse2 +#test cpuid avx512 avx2 sse2 [ChaCha(8)]