From 48c8fe8bb490dc3fd4a0968eef7a98c059a35ad4 Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Sat, 11 May 2024 12:22:13 -0700 Subject: [PATCH] aes-gcm: Clarify CPU feature detection. Although every key has been represented with the same types `aes::AES_KEY` and `gcm::HTable` regardless of which implementation is used, in reality those types are polymorphic in ways that aren't captured by the type system currently. Thus, the `set_encrypt_key!` function must be matched with the corresponding `encrypt_block!` and/or `ctr32_encrypt_blocks!` function. Previously, we did CPU feature detection for each function call and assumed that CPU feature detection is idempotent. Now, we do CPU feature detection during key construction and make the lesser assumption that at least those same CPU features are available as long as the key exists. This is a step towards making further improvements in CPU-feature-based dispatching. One side-effect of this change is that GCM keys (and thus AES-GCM keys) are now much smaller on targets that don't support any assembly implementation, as they now just store a single `U128` instead of a whole `HTable`. --- src/aead/aes.rs | 379 +++++----------------- src/aead/aes/bs.rs | 62 ++++ src/aead/aes/fallback.rs | 47 +++ src/aead/aes/ffi.rs | 17 +- src/aead/aes/hw.rs | 66 ++++ src/aead/aes/vp.rs | 130 ++++++++ src/aead/aes_gcm.rs | 314 +++++++++++++----- src/aead/algorithm.rs | 8 +- src/aead/gcm.rs | 215 +++--------- src/aead/gcm/clmul.rs | 66 ++++ src/aead/gcm/clmulavxmovbe.rs | 53 +++ src/aead/gcm/{gcm_nohw.rs => fallback.rs} | 35 +- src/aead/gcm/ffi.rs | 55 ++-- src/aead/gcm/neon.rs | 45 +++ src/aead/shift.rs | 2 +- src/cpu.rs | 54 +++ src/cpu/arm.rs | 15 +- src/cpu/intel.rs | 34 +- src/lib.rs | 9 + src/polyfill.rs | 9 + 20 files changed, 994 insertions(+), 621 deletions(-) create mode 100644 src/aead/aes/bs.rs create mode 100644 src/aead/aes/fallback.rs create mode 100644 src/aead/aes/hw.rs create mode 100644 src/aead/aes/vp.rs create mode 100644 src/aead/gcm/clmul.rs create mode 100644 src/aead/gcm/clmulavxmovbe.rs rename src/aead/gcm/{gcm_nohw.rs => fallback.rs} (92%) create mode 100644 src/aead/gcm/neon.rs diff --git a/src/aead/aes.rs b/src/aead/aes.rs index d3ce316899..e0416cd796 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -13,14 +13,24 @@ // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{nonce::Nonce, quic::Sample, NONCE_LEN}; -use crate::{constant_time, cpu, error}; +use crate::{ + constant_time, + cpu::{self, GetFeature as _}, + error, +}; use cfg_if::cfg_if; use core::ops::RangeFrom; pub(super) use ffi::Counter; + #[macro_use] mod ffi; +mod bs; +pub(super) mod fallback; +pub(super) mod hw; +pub(super) mod vp; + cfg_if! { if #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] { pub(super) use ffi::AES_KEY; @@ -30,8 +40,19 @@ cfg_if! 
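The dispatch pattern this commit describes can be summarized by the following self-contained sketch (illustrative only, with hypothetical stand-in types; in the patch itself the variants are `hw::Key`, `vp::Key`, and `fallback::Key`). The point is that CPU feature detection runs exactly once, in the constructor, and the enum variant chosen there pins every later call to the matching implementation, so calls no longer re-detect features or rely on detection being idempotent:

    // Illustrative sketch only; stand-in types, not code from this patch.
    #[derive(Clone, Copy)]
    struct Features {
        hw_aes: bool, // stand-in for the real per-feature detection result
    }

    #[derive(Clone)]
    struct HwKey; // stand-in for aes::hw::Key
    #[derive(Clone)]
    struct FallbackKey; // stand-in for aes::fallback::Key

    #[derive(Clone)]
    enum Key {
        Hw(HwKey), // constructed only when the AES instructions were detected
        Fallback(FallbackKey),
    }

    impl Key {
        fn new(cpu: Features) -> Self {
            if cpu.hw_aes {
                // CPU feature detection happens exactly once, here.
                return Key::Hw(HwKey);
            }
            Key::Fallback(FallbackKey)
        }

        fn encrypt_block(&self, block: [u8; 16]) -> [u8; 16] {
            // Later calls just match on the variant chosen at construction.
            match self {
                Key::Hw(_) => block,       // would call the matching aes_hw_* functions
                Key::Fallback(_) => block, // would call the matching aes_nohw_* functions
            }
        }
    }

    fn main() {
        let key = Key::new(Features { hw_aes: false });
        let _ = key.encrypt_block([0u8; 16]);
    }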
{ } #[derive(Clone)] -pub(super) struct Key { - inner: AES_KEY, +pub(super) enum Key { + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] + Hw(hw::Key), + + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" + ))] + Vp(vp::Key), + + Fallback(fallback::Key), } impl Key { @@ -40,201 +61,48 @@ impl Key { bytes: KeyBytes<'_>, cpu_features: cpu::Features, ) -> Result { - let key = match detect_implementation(cpu_features) { - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - // SAFETY: `aes_hw_set_encrypt_key` satisfies the `set_encrypt_key!` - // contract for these target architectures. - Implementation::HWAES => unsafe { - set_encrypt_key!(aes_hw_set_encrypt_key, bytes, cpu_features) - }, - - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86_64", - target_arch = "x86" - ))] - // SAFETY: `vpaes_set_encrypt_key` satisfies the `set_encrypt_key!` - // contract for these target architectures. - Implementation::VPAES_BSAES => unsafe { - set_encrypt_key!(vpaes_set_encrypt_key, bytes, cpu_features) - }, - - // SAFETY: `aes_nohw_set_encrypt_key` satisfies the `set_encrypt_key!` - // contract. - Implementation::NOHW => unsafe { - set_encrypt_key!(aes_nohw_set_encrypt_key, bytes, cpu_features) - }, - }?; - - Ok(Self { inner: key }) - } - - #[inline] - pub fn encrypt_block(&self, a: Block, cpu_features: cpu::Features) -> Block { - match detect_implementation(cpu_features) { - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - Implementation::HWAES => self.encrypt_iv_xor_block(Iv(a), ZERO_BLOCK, cpu_features), - - #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] - Implementation::VPAES_BSAES => { - self.encrypt_iv_xor_block(Iv(a), ZERO_BLOCK, cpu_features) - } - - // `encrypt_iv_xor_block` calls `encrypt_block` on `target_arch = "x86"`. - #[cfg(target_arch = "x86")] - Implementation::VPAES_BSAES => unsafe { encrypt_block!(vpaes_encrypt, a, &self.inner) }, - - Implementation::NOHW => unsafe { encrypt_block!(aes_nohw_encrypt, a, &self.inner) }, + #[cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))] + if let Some(hw_features) = cpu_features.get_feature() { + return Ok(Self::Hw(hw::Key::new(bytes, hw_features)?)); } - } - pub fn encrypt_iv_xor_block( - &self, - iv: Iv, - mut block: Block, - cpu_features: cpu::Features, - ) -> Block { - let use_ctr32 = match detect_implementation(cpu_features) { - // These have specialized one-block implementations. - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - Implementation::HWAES => true, - // `ctr32_encrypt_within` calls `encrypt_iv_xor_block` on `target_arch = "x86"`. - #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] - Implementation::VPAES_BSAES => true, - _ => false, - }; - if use_ctr32 { - let mut ctr = Counter(iv.0); // We're only doing one block so this is OK. 
- self.ctr32_encrypt_within(&mut block, 0.., &mut ctr, cpu_features); - block - } else { - let encrypted_iv = self.encrypt_block(iv.into_block_less_safe(), cpu_features); - constant_time::xor_16(encrypted_iv, block) + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86_64", + target_arch = "x86" + ))] + if let Some(vp_features) = cpu_features.get_feature() { + return Ok(Self::Vp(vp::Key::new(bytes, vp_features)?)); } + + let _ = cpu_features; + + Ok(Self::Fallback(fallback::Key::new(bytes)?)) } #[inline] - pub(super) fn ctr32_encrypt_within( - &self, - in_out: &mut [u8], - src: RangeFrom, - ctr: &mut Counter, - cpu_features: cpu::Features, - ) { - match detect_implementation(cpu_features) { + pub fn encrypt_block(&self, a: Block) -> Block { + match self { #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - // SAFETY: - // * self.inner was initialized with `aes_hw_set_encrypt_key` above, - // as required by `aes_hw_ctr32_encrypt_blocks`. - // * `aes_hw_ctr32_encrypt_blocks` satisfies the contract for - // `ctr32_encrypt_blocks`. - Implementation::HWAES => unsafe { - ctr32_encrypt_blocks!( - aes_hw_ctr32_encrypt_blocks, - in_out, - src, - &self.inner, - ctr, - cpu_features - ) - }, - - #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] - Implementation::VPAES_BSAES => { - #[cfg(target_arch = "arm")] - let in_out = { - let blocks = in_out[src.clone()].len() / BLOCK_LEN; - - // bsaes operates in batches of 8 blocks. - let bsaes_blocks = if blocks >= 8 && (blocks % 8) < 6 { - // It's faster to use bsaes for all the full batches and then - // switch to vpaes for the last partial batch (if any). - blocks - (blocks % 8) - } else if blocks >= 8 { - // It's faster to let bsaes handle everything including - // the last partial batch. - blocks - } else { - // It's faster to let vpaes handle everything. - 0 - }; - let bsaes_in_out_len = bsaes_blocks * BLOCK_LEN; - - // SAFETY: - // * self.inner was initialized with `vpaes_set_encrypt_key` above, - // as required by `bsaes_ctr32_encrypt_blocks_with_vpaes_key`. - unsafe { - bsaes_ctr32_encrypt_blocks_with_vpaes_key( - &mut in_out[..(src.start + bsaes_in_out_len)], - src.clone(), - &self.inner, - ctr, - cpu_features, - ); - } - - &mut in_out[bsaes_in_out_len..] - }; - - // SAFETY: - // * self.inner was initialized with `vpaes_set_encrypt_key` above, - // as required by `vpaes_ctr32_encrypt_blocks`. - // * `vpaes_ctr32_encrypt_blocks` satisfies the contract for - // `ctr32_encrypt_blocks`. - unsafe { - ctr32_encrypt_blocks!( - vpaes_ctr32_encrypt_blocks, - in_out, - src, - &self.inner, - ctr, - cpu_features - ) - } - } - - #[cfg(target_arch = "x86")] - Implementation::VPAES_BSAES => { - super::shift::shift_full_blocks(in_out, src, |input| { - self.encrypt_iv_xor_block(ctr.increment(), *input, cpu_features) - }); - } - - // SAFETY: - // * self.inner was initialized with `aes_nohw_set_encrypt_key` - // above, as required by `aes_nohw_ctr32_encrypt_blocks`. - // * `aes_nohw_ctr32_encrypt_blocks` satisfies the contract for - // `ctr32_encrypt_blocks`. 
- Implementation::NOHW => unsafe { - ctr32_encrypt_blocks!( - aes_nohw_ctr32_encrypt_blocks, - in_out, - src, - &self.inner, - ctr, - cpu_features - ) - }, + Key::Hw(inner) => inner.encrypt_block(a), + + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" + ))] + Key::Vp(inner) => inner.encrypt_block(a), + + Key::Fallback(inner) => inner.encrypt_block(a), } } pub fn new_mask(&self, sample: Sample) -> [u8; 5] { - let [b0, b1, b2, b3, b4, ..] = self.encrypt_block(sample, cpu::features()); + let [b0, b1, b2, b3, b4, ..] = self.encrypt_block(sample); [b0, b1, b2, b3, b4] } - - #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] - #[must_use] - pub fn is_aes_hw(&self, cpu_features: cpu::Features) -> bool { - matches!(detect_implementation(cpu_features), Implementation::HWAES) - } - - #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] - #[must_use] - pub(super) fn inner_less_safe(&self) -> &AES_KEY { - &self.inner - } } pub const AES_128_KEY_LEN: usize = 128 / 8; @@ -280,131 +148,39 @@ impl From for Iv { } } -impl Iv { - /// "Less safe" because it defeats attempts to use the type system to prevent reuse of the IV. - #[inline] - pub(super) fn into_block_less_safe(self) -> Block { - self.0 - } -} - pub(super) type Block = [u8; BLOCK_LEN]; pub(super) const BLOCK_LEN: usize = 16; pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; -#[derive(Clone, Copy)] -#[allow(clippy::upper_case_acronyms)] -pub enum Implementation { - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - HWAES, - - // On "arm" only, this indicates that the bsaes implementation may be used. - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86_64", - target_arch = "x86" - ))] - VPAES_BSAES, - - NOHW, +pub(super) trait EncryptBlock { + fn encrypt_block(&self, block: Block) -> Block; + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block; } -fn detect_implementation(cpu_features: cpu::Features) -> Implementation { - // `cpu_features` is only used for specific platforms. - #[cfg(not(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86_64", - target_arch = "x86" - )))] - let _cpu_features = cpu_features; - - #[cfg(target_arch = "aarch64")] - { - if cpu::arm::AES.available(cpu_features) { - return Implementation::HWAES; - } - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if cpu::intel::AES.available(cpu_features) { - return Implementation::HWAES; - } - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if cpu::intel::SSSE3.available(cpu_features) { - return Implementation::VPAES_BSAES; - } - } - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - { - if cpu::arm::NEON.available(cpu_features) { - return Implementation::VPAES_BSAES; - } - } +pub(super) trait EncryptCtr32 { + // TODO: Document safety: + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter); +} - { - Implementation::NOHW - } +#[allow(dead_code)] +fn encrypt_block_using_encrypt_iv_xor_block(key: &impl EncryptBlock, block: Block) -> Block { + key.encrypt_iv_xor_block(Iv(block), ZERO_BLOCK) } -/// SAFETY: -/// * The caller must ensure that if blocks > 0 then either `input` and -/// `output` do not overlap at all, or input == output.add(n) for some -/// (nonnegative) n. -/// * if blocks > 0, The caller must ensure `input` points to `blocks` blocks -/// and that `output` points to writable space for `blocks` blocks. 
-/// * The caller must ensure that `vpaes_key` was initialized with -/// `vpaes_set_encrypt_key`. -/// * Upon returning, `blocks` blocks will have been read from `input` and -/// written to `output`. -#[cfg(target_arch = "arm")] -unsafe fn bsaes_ctr32_encrypt_blocks_with_vpaes_key( - in_out: &mut [u8], - src: RangeFrom, - vpaes_key: &AES_KEY, - ctr: &mut Counter, - cpu_features: cpu::Features, -) { - prefixed_extern! { - // bsaes_ctr32_encrypt_blocks requires transformation of an existing - // VPAES key; there is no `bsaes_set_encrypt_key`. - fn vpaes_encrypt_key_to_bsaes(bsaes_key: *mut AES_KEY, vpaes_key: &AES_KEY); - } +fn encrypt_iv_xor_block_using_encrypt_block( + key: &impl EncryptBlock, + iv: Iv, + block: Block, +) -> Block { + let encrypted_iv = key.encrypt_block(iv.0); + constant_time::xor_16(encrypted_iv, block) +} - // SAFETY: - // * The caller ensures `vpaes_key` was initialized by - // `vpaes_set_encrypt_key`. - // * `bsaes_key was zeroed above, and `vpaes_encrypt_key_to_bsaes` - // is assumed to initialize `bsaes_key`. - let bsaes_key = - unsafe { AES_KEY::derive(vpaes_encrypt_key_to_bsaes, &vpaes_key, cpu_features) }; - - // The code for `vpaes_encrypt_key_to_bsaes` notes "vpaes stores one - // fewer round count than bsaes, but the number of keys is the same," - // so use this as a sanity check. - debug_assert_eq!(bsaes_key.rounds(), vpaes_key.rounds() + 1); - - // SAFETY: - // * `bsaes_key` is in bsaes format after calling - // `vpaes_encrypt_key_to_bsaes`. - // * `bsaes_ctr32_encrypt_blocks` satisfies the contract for - // `ctr32_encrypt_blocks`. - unsafe { - ctr32_encrypt_blocks!( - bsaes_ctr32_encrypt_blocks, - in_out, - src, - &bsaes_key, - ctr, - cpu_features - ); - } +#[allow(dead_code)] +fn encrypt_iv_xor_block_using_ctr32(key: &impl EncryptCtr32, iv: Iv, mut block: Block) -> Block { + let mut ctr = Counter(iv.0); // This is OK because we're only encrypting one block. + key.ctr32_encrypt_within(&mut block, 0.., &mut ctr); + block } #[cfg(test)] @@ -414,7 +190,6 @@ mod tests { #[test] pub fn test_aes() { - let cpu_features = cpu::features(); test::run(test_file!("aes_tests.txt"), |section, test_case| { assert_eq!(section, ""); let key = consume_key(test_case, "Key"); @@ -422,7 +197,7 @@ mod tests { let block: Block = input.as_slice().try_into()?; let expected_output = test_case.consume_bytes("Output"); - let output = key.encrypt_block(block, cpu_features); + let output = key.encrypt_block(block); assert_eq!(output.as_ref(), &expected_output[..]); Ok(()) diff --git a/src/aead/aes/bs.rs b/src/aead/aes/bs.rs new file mode 100644 index 0000000000..5c69715abd --- /dev/null +++ b/src/aead/aes/bs.rs @@ -0,0 +1,62 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +#![cfg(target_arch = "arm")] + +use super::{Counter, AES_KEY}; +use core::ops::RangeFrom; + +/// SAFETY: +/// * The caller must ensure that if blocks > 0 then either `input` and +/// `output` do not overlap at all, or input == output.add(n) for some +/// (nonnegative) n. +/// * if blocks > 0, The caller must ensure `input` points to `blocks` blocks +/// and that `output` points to writable space for `blocks` blocks. +/// * The caller must ensure that `vpaes_key` was initialized with +/// `vpaes_set_encrypt_key`. +/// * Upon returning, `blocks` blocks will have been read from `input` and +/// written to `output`. +pub(super) unsafe fn ctr32_encrypt_blocks_with_vpaes_key( + in_out: &mut [u8], + src: RangeFrom, + vpaes_key: &AES_KEY, + ctr: &mut Counter, +) { + prefixed_extern! { + // bsaes_ctr32_encrypt_blocks requires transformation of an existing + // VPAES key; there is no `bsaes_set_encrypt_key`. + fn vpaes_encrypt_key_to_bsaes(bsaes_key: *mut AES_KEY, vpaes_key: &AES_KEY); + } + + // SAFETY: + // * The caller ensures `vpaes_key` was initialized by + // `vpaes_set_encrypt_key`. + // * `bsaes_key was zeroed above, and `vpaes_encrypt_key_to_bsaes` + // is assumed to initialize `bsaes_key`. + let bsaes_key = unsafe { AES_KEY::derive(vpaes_encrypt_key_to_bsaes, vpaes_key) }; + + // The code for `vpaes_encrypt_key_to_bsaes` notes "vpaes stores one + // fewer round count than bsaes, but the number of keys is the same," + // so use this as a sanity check. + debug_assert_eq!(bsaes_key.rounds(), vpaes_key.rounds() + 1); + + // SAFETY: + // * `bsaes_key` is in bsaes format after calling + // `vpaes_encrypt_key_to_bsaes`. + // * `bsaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. + unsafe { + ctr32_encrypt_blocks!(bsaes_ctr32_encrypt_blocks, in_out, src, &bsaes_key, ctr,); + } +} diff --git a/src/aead/aes/fallback.rs b/src/aead/aes/fallback.rs new file mode 100644 index 0000000000..00caa694ab --- /dev/null +++ b/src/aead/aes/fallback.rs @@ -0,0 +1,47 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, AES_KEY}; +use crate::error; +use core::ops::RangeFrom; + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new(bytes: KeyBytes<'_>) -> Result { + let inner = unsafe { set_encrypt_key!(aes_nohw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(aes_nohw_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter) { + unsafe { + ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr) + } + } +} diff --git a/src/aead/aes/ffi.rs b/src/aead/aes/ffi.rs index 5248fbe228..840845059b 100644 --- a/src/aead/aes/ffi.rs +++ b/src/aead/aes/ffi.rs @@ -13,7 +13,7 @@ // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. use super::{Block, KeyBytes, BLOCK_LEN}; -use crate::{bits::BitLength, c, cpu, error, polyfill::slice}; +use crate::{bits::BitLength, c, error, polyfill::slice}; use core::{num::NonZeroUsize, ops::RangeFrom}; /// nonce || big-endian counter. @@ -36,7 +36,6 @@ impl AES_KEY { pub(super) unsafe fn new( f: unsafe extern "C" fn(*const u8, BitLength, *mut AES_KEY) -> c::int, bytes: KeyBytes<'_>, - _cpu_features: cpu::Features, ) -> Result { let mut key = Self { rd_key: [0; 4 * (MAX_ROUNDS + 1)], @@ -63,7 +62,6 @@ impl AES_KEY { pub(super) unsafe fn derive( f: for<'a> unsafe extern "C" fn(*mut AES_KEY, &'a AES_KEY), src: &Self, - _cpu_features: cpu::Features, ) -> Self { let mut r = AES_KEY { rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], @@ -89,12 +87,12 @@ impl AES_KEY { // In BoringSSL, the C prototypes for these are in // crypto/fipsmodule/aes/internal.h. macro_rules! set_encrypt_key { - ( $name:ident, $key_bytes:expr, $cpu_features:expr $(,)? ) => {{ + ( $name:ident, $key_bytes:expr $(,)? ) => {{ use crate::{bits::BitLength, c}; prefixed_extern! { fn $name(user_key: *const u8, bits: BitLength, key: *mut AES_KEY) -> c::int; } - $crate::aead::aes::ffi::AES_KEY::new($name, $key_bytes, $cpu_features) + $crate::aead::aes::ffi::AES_KEY::new($name, $key_bytes) }}; } @@ -129,7 +127,7 @@ impl AES_KEY { /// * The caller must ensure that fhe function `$name` satisfies the conditions /// for the `f` parameter to `ctr32_encrypt_blocks`. macro_rules! ctr32_encrypt_blocks { - ($name:ident, $in_out:expr, $src:expr, $key:expr, $ctr:expr, $cpu_features:expr ) => {{ + ($name:ident, $in_out:expr, $src:expr, $key:expr, $ctr:expr $(,)? ) => {{ use crate::{ aead::aes::{ffi::AES_KEY, Counter, BLOCK_LEN}, c, @@ -143,7 +141,7 @@ macro_rules! ctr32_encrypt_blocks { ivec: &Counter, ); } - $key.ctr32_encrypt_blocks($name, $in_out, $src, $ctr, $cpu_features) + $key.ctr32_encrypt_blocks($name, $in_out, $src, $ctr) }}; } @@ -172,7 +170,6 @@ impl AES_KEY { in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter, - cpu_features: cpu::Features, ) { let (input, leftover) = slice::as_chunks(&in_out[src]); debug_assert_eq!(leftover.len(), 0); @@ -189,8 +186,6 @@ impl AES_KEY { let input = input.as_ptr(); let output: *mut [u8; BLOCK_LEN] = in_out.as_mut_ptr().cast(); - let _: cpu::Features = cpu_features; - // SAFETY: // * `input` points to `blocks` blocks. // * `output` points to space for `blocks` blocks to be written. 
@@ -200,8 +195,6 @@ impl AES_KEY { // `blocks` including zero. // * The caller is responsible for ensuring `key` was initialized by the // `set_encrypt_key!` invocation required by `f`. - // * CPU feature detection has been done so `f` can inspect - // CPU features. unsafe { f(input, output, blocks, self, ctr); } diff --git a/src/aead/aes/hw.rs b/src/aead/aes/hw.rs new file mode 100644 index 0000000000..0db11cfb37 --- /dev/null +++ b/src/aead/aes/hw.rs @@ -0,0 +1,66 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, AES_KEY}; +use crate::{cpu, error}; +use core::ops::RangeFrom; + +#[cfg(target_arch = "aarch64")] +pub(in super::super) type RequiredCpuFeatures = cpu::arm::Aes; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = cpu::intel::Aes; + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _cpu: RequiredCpuFeatures, + ) -> Result { + let inner = unsafe { set_encrypt_key!(aes_hw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } + + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + #[must_use] + pub(in super::super) fn inner_less_safe(&self) -> &AES_KEY { + &self.inner + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter) { + #[cfg(target_arch = "x86_64")] + let _: cpu::Features = cpu::features(); + unsafe { + ctr32_encrypt_blocks!(aes_hw_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr,) + } + } +} diff --git a/src/aead/aes/vp.rs b/src/aead/aes/vp.rs new file mode 100644 index 0000000000..fb97bd44f5 --- /dev/null +++ b/src/aead/aes/vp.rs @@ -0,0 +1,130 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, AES_KEY}; +use crate::{cpu, error}; +use core::ops::RangeFrom; + +#[cfg(any(target_arch = "aarch64", target_arch = "arm"))] +type RequiredCpuFeatures = cpu::arm::Neon; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +type RequiredCpuFeatures = cpu::intel::Ssse3; + +#[derive(Clone)] +pub(in super::super) struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _cpu: RequiredCpuFeatures, + ) -> Result { + let inner = unsafe { set_encrypt_key!(vpaes_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +#[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr,) } + } +} + +#[cfg(target_arch = "arm")] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter) { + use super::{bs, BLOCK_LEN}; + + let in_out = { + let blocks = in_out[src.clone()].len() / BLOCK_LEN; + + // bsaes operates in batches of 8 blocks. + let bsaes_blocks = if blocks >= 8 && (blocks % 8) < 6 { + // It's faster to use bsaes for all the full batches and then + // switch to vpaes for the last partial batch (if any). + blocks - (blocks % 8) + } else if blocks >= 8 { + // It's faster to let bsaes handle everything including + // the last partial batch. + blocks + } else { + // It's faster to let vpaes handle everything. + 0 + }; + let bsaes_in_out_len = bsaes_blocks * BLOCK_LEN; + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `bsaes_ctr32_encrypt_blocks_with_vpaes_key`. + unsafe { + bs::ctr32_encrypt_blocks_with_vpaes_key( + &mut in_out[..(src.start + bsaes_in_out_len)], + src.clone(), + &self.inner, + ctr, + ); + } + + &mut in_out[bsaes_in_out_len..] + }; + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `vpaes_ctr32_encrypt_blocks`. + // * `vpaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. 
+ unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr,) } + } +} + +#[cfg(target_arch = "x86")] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(vpaes_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +#[cfg(target_arch = "x86")] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: &mut [u8], src: RangeFrom, ctr: &mut Counter) { + super::super::shift::shift_full_blocks(in_out, src, |input| { + self.encrypt_iv_xor_block(ctr.increment(), *input) + }); + } +} diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs index aa83a975b9..ca44ac3403 100644 --- a/src/aead/aes_gcm.rs +++ b/src/aead/aes_gcm.rs @@ -22,48 +22,110 @@ use crate::{ }; use core::ops::RangeFrom; +#[cfg(target_arch = "x86_64")] +use aes::EncryptCtr32 as _; + +#[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" +))] +use cpu::GetFeature as _; + #[derive(Clone)] -pub struct Key { - gcm_key: gcm::Key, // First because it has a large alignment requirement. - aes_key: aes::Key, -} +pub(super) struct Key(DynKey); impl Key { pub(super) fn new( key: aes::KeyBytes, cpu_features: cpu::Features, ) -> Result { - let aes_key = aes::Key::new(key, cpu_features)?; - let gcm_key = gcm::Key::new( - aes_key.encrypt_block(ZERO_BLOCK, cpu_features), - cpu_features, - ); - Ok(Self { gcm_key, aes_key }) + Ok(Self(DynKey::new(key, cpu_features)?)) + } +} + +#[derive(Clone)] +enum DynKey { + #[cfg(target_arch = "x86_64")] + AesHwClMulAvxMovbe(Combo), + + #[cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))] + AesHwClMul(Combo), + + #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] + Simd(Combo), + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Simd(Combo), + + Fallback(Combo), +} + +impl DynKey { + fn new(key: aes::KeyBytes, cpu_features: cpu::Features) -> Result { + #[cfg(target_arch = "x86_64")] + if let (Some(aes), Some(gcm)) = (cpu_features.get_feature(), cpu_features.get_feature()) { + let aes_key = aes::hw::Key::new(key, aes)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, gcm); + return Ok(Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })); + } + + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] + if let (Some(aes), Some(gcm)) = (cpu_features.get_feature(), cpu_features.get_feature()) { + let aes_key = aes::hw::Key::new(key, aes)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); + return Ok(Self::AesHwClMul(Combo { aes_key, gcm_key })); + } + + #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] + if let (Some(aes), Some(gcm)) = (cpu_features.get_feature(), cpu_features.get_feature()) { + let aes_key = aes::vp::Key::new(key, aes)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::neon::Key::new(gcm_key_value, gcm); + return Ok(Self::Simd(Combo { aes_key, gcm_key })); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if let Some(aes) = cpu_features.get_feature() { + let aes_key = aes::vp::Key::new(key, aes)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + return Ok(Self::Simd(Combo { aes_key, gcm_key })); + } + + let _ = 
cpu_features; + + let aes_key = aes::fallback::Key::new(key)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + Ok(Self::Fallback(Combo { aes_key, gcm_key })) } } +fn derive_gcm_key_value(aes_key: &impl aes::EncryptBlock) -> gcm::KeyValue { + gcm::KeyValue::new(aes_key.encrypt_block(ZERO_BLOCK)) +} + const CHUNK_BLOCKS: usize = 3 * 1024 / 16; +#[inline(never)] pub(super) fn seal( - key: &Key, + Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], - cpu_features: cpu::Features, ) -> Result { - let Key { gcm_key, aes_key } = key; - - let mut auth = gcm::Context::new(gcm_key, aad, in_out.len(), cpu_features)?; - let mut ctr = Counter::one(nonce); let tag_iv = ctr.increment(); - #[cfg(target_arch = "x86_64")] - let in_out = { - if !aes_key.is_aes_hw(cpu_features) || !auth.is_avx() { - in_out - } else { + match key { + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { use crate::c; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; let (htable, xi) = auth.inner(); prefixed_extern! { // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The @@ -89,26 +151,27 @@ pub(super) fn seal( ) }; - match in_out.get_mut(processed..) { + let ramaining = match in_out.get_mut(processed..) { Some(remaining) => remaining, None => { // This can't happen. If it did, then the assembly already // caused a buffer overflow. unreachable!() } - } + }; + let (whole, remainder) = slice::as_chunks_mut(ramaining); + aes_key.ctr32_encrypt_within(slice::flatten_mut(whole), 0.., &mut ctr); + auth.update_blocks(whole); + seal_finish(aes_key, auth, remainder, ctr, tag_iv) } - }; - let (whole, remainder) = slice::as_chunks_mut(in_out); - - #[cfg(target_arch = "aarch64")] - let whole = { - if !aes_key.is_aes_hw(cpu_features) || !auth.is_clmul() { - whole - } else { + #[cfg(target_arch = "aarch64")] + DynKey::AesHwClMul(Combo { aes_key, gcm_key }) => { use crate::bits::BitLength; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let (whole, remainder) = slice::as_chunks_mut(in_out); let whole_block_bits = auth.in_out_whole_block_bits(); let whole_block_bits_u64: BitLength = whole_block_bits.into(); if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { @@ -137,20 +200,54 @@ pub(super) fn seal( ) } } - - &mut [] + seal_finish(aes_key, auth, remainder, ctr, tag_iv) } - }; + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + DynKey::Fallback(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + } +} + +fn seal_strided( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let (whole, remainder) = slice::as_chunks_mut(in_out); for chunk in whole.chunks_mut(CHUNK_BLOCKS) { - aes_key.ctr32_encrypt_within(slice::flatten_mut(chunk), 0.., &mut ctr, cpu_features); + aes_key.ctr32_encrypt_within(slice::flatten_mut(chunk), 0.., &mut ctr); auth.update_blocks(chunk); } + seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +fn seal_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: &mut [u8], + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { if 
!remainder.is_empty() { let mut input = ZERO_BLOCK; overwrite_at_start(&mut input, remainder); - let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input, cpu_features); + let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input); output[remainder.len()..].fill(0); auth.update_block(output); overwrite_at_start(remainder, &output); @@ -159,36 +256,26 @@ pub(super) fn seal( Ok(finish(aes_key, auth, tag_iv)) } +#[inline(never)] pub(super) fn open( - key: &Key, + Key(key): &Key, nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], src: RangeFrom, - cpu_features: cpu::Features, ) -> Result { - let Key { gcm_key, aes_key } = key; - - let mut auth = { - let unprefixed_len = in_out - .len() - .checked_sub(src.start) - .ok_or(error::Unspecified)?; - gcm::Context::new(gcm_key, aad, unprefixed_len, cpu_features) - }?; + // Check that `src` is in bounds. + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + let input = in_out.get(src.clone()).ok_or(error::Unspecified)?; let mut ctr = Counter::one(nonce); let tag_iv = ctr.increment(); - let in_prefix_len = src.start; - - #[cfg(target_arch = "x86_64")] - let in_out = { - if !aes_key.is_aes_hw(cpu_features) || !auth.is_avx() { - in_out - } else { + match key { + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { use crate::c; - let (htable, xi) = auth.inner(); + prefixed_extern! { // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The // assembly says it needs just nine values in that array. @@ -202,6 +289,8 @@ pub(super) fn open( Xi: &mut gcm::Xi) -> c::size_t; } + let mut auth = gcm::Context::new(gcm_key, aad, input.len())?; + let (htable, xi) = auth.inner(); let processed = unsafe { aesni_gcm_decrypt( in_out[src.clone()].as_ptr(), @@ -213,24 +302,48 @@ pub(super) fn open( xi, ) }; - match in_out.get_mut(processed..) { + let in_out = match in_out.get_mut(processed..) { Some(remaining) => remaining, None => { // This can't happen. If it did, then the assembly already // caused a buffer overflow. unreachable!() } - } + }; + // Authenticate any remaining whole blocks. + let input = match in_out.get(src.clone()) { + Some(remaining_input) => remaining_input, + None => unreachable!(), + }; + let (whole, _) = slice::as_chunks(input); + auth.update_blocks(whole); + + let whole_len = slice::flatten(whole).len(); + + // Decrypt any remaining whole blocks. + aes_key.ctr32_encrypt_within( + &mut in_out[..(src.start + whole_len)], + src.clone(), + &mut ctr, + ); + + let in_out = match in_out.get_mut(whole_len..) { + Some(partial) => partial, + None => unreachable!(), + }; + open_finish(aes_key, auth, in_out, src, ctr, tag_iv) } - }; - #[cfg(target_arch = "aarch64")] - let in_out = { - if !aes_key.is_aes_hw(cpu_features) || !auth.is_clmul() { - in_out - } else { + #[cfg(target_arch = "aarch64")] + DynKey::AesHwClMul(Combo { aes_key, gcm_key }) => { use crate::bits::BitLength; + let input_len = input.len(); + let mut auth = gcm::Context::new(gcm_key, aad, input_len)?; + + let remainder_len = input_len % BLOCK_LEN; + let whole_len = input_len - remainder_len; + let whole_block_bits = auth.in_out_whole_block_bits(); let whole_block_bits_u64: BitLength = whole_block_bits.into(); if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { @@ -260,15 +373,43 @@ pub(super) fn open( ) } } - - &mut in_out[whole_block_bits.as_usize_bytes_rounded_up()..] 
+ let remainder = &mut in_out[whole_len..]; + open_finish(aes_key, auth, remainder, src, ctr, tag_iv) } - }; - let whole_len = { - let in_out_len = in_out.len() - in_prefix_len; - in_out_len - (in_out_len % BLOCK_LEN) - }; + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => open_strided(c, aad, in_out, src, ctr, tag_iv), + + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => open_strided(c, aad, in_out, src, ctr, tag_iv), + + DynKey::Fallback(c) => open_strided(c, aad, in_out, src, ctr, tag_iv), + } +} + +#[inline(always)] +fn open_strided( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let input = in_out.get(src.clone()).ok_or(error::Unspecified)?; + let input_len = input.len(); + + let mut auth = gcm::Context::new(gcm_key, aad, input_len)?; + + let remainder_len = input_len % BLOCK_LEN; + let whole_len = input_len - remainder_len; + let in_prefix_len = src.start; + { let mut chunk_len = CHUNK_BLOCKS * BLOCK_LEN; let mut output = 0; @@ -290,29 +431,40 @@ pub(super) fn open( &mut in_out[output..][..(chunk_len + in_prefix_len)], in_prefix_len.., &mut ctr, - cpu_features, ); output += chunk_len; input += chunk_len; } } - let remainder = &mut in_out[whole_len..]; - shift::shift_partial((in_prefix_len, remainder), |remainder| { + open_finish(aes_key, auth, &mut in_out[whole_len..], src, ctr, tag_iv) +} + +fn open_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: &mut [u8], + src: RangeFrom, + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + shift::shift_partial((src.start, remainder), |remainder| { let mut input = ZERO_BLOCK; overwrite_at_start(&mut input, remainder); auth.update_block(input); - aes_key.encrypt_iv_xor_block(ctr.into(), input, cpu_features) + aes_key.encrypt_iv_xor_block(ctr.into(), input) }); Ok(finish(aes_key, auth, tag_iv)) } -fn finish(aes_key: &aes::Key, gcm_ctx: gcm::Context, tag_iv: aes::Iv) -> Tag { +fn finish( + aes_key: &A, + gcm_ctx: gcm::Context, + tag_iv: aes::Iv, +) -> Tag { // Finalize the tag and return it. 
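For reference, `pre_finish`/`finish` above implement the standard AES-GCM tag computation: with hash key H = AES_K(0^128) (see `derive_gcm_key_value`), GHASH folds the AAD blocks, the ciphertext blocks, and finally a length block holding the 64-bit big-endian bit lengths of the AAD and the ciphertext (built by `pre_finish`), as

    Xi <- (Xi ^ B) * H    in GF(2^128), for each 16-byte block B

and the tag is that digest encrypted under the reserved first counter block `tag_iv` (J0):

    Tag = AES_K(J0) ^ GHASH_H(AAD, ciphertext)

which is exactly what `aes_key.encrypt_iv_xor_block(tag_iv, pre_tag)` computes.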
- gcm_ctx.pre_finish(|pre_tag, cpu_features| { - Tag(aes_key.encrypt_iv_xor_block(tag_iv, pre_tag, cpu_features)) - }) + gcm_ctx.pre_finish(|pre_tag| Tag(aes_key.encrypt_iv_xor_block(tag_iv, pre_tag))) } pub(super) const MAX_IN_OUT_LEN: usize = super::max_input_len(BLOCK_LEN, 2); @@ -326,3 +478,9 @@ pub(super) const MAX_IN_OUT_LEN: usize = super::max_input_len(BLOCK_LEN, 2); // [RFC 5116 Section 5.2]: https://tools.ietf.org/html/rfc5116#section-5.2 const _MAX_INPUT_LEN_BOUNDED_BY_NIST: () = assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(((1u64 << 39) - 256) / 8)); + +#[derive(Copy, Clone)] +pub(super) struct Combo { + pub(super) aes_key: Aes, + pub(super) gcm_key: Gcm, +} diff --git a/src/aead/algorithm.rs b/src/aead/algorithm.rs index 1357e93c03..1556cf5dde 100644 --- a/src/aead/algorithm.rs +++ b/src/aead/algorithm.rs @@ -187,13 +187,13 @@ fn aes_gcm_seal( nonce: Nonce, aad: Aad<&[u8]>, in_out: &mut [u8], - cpu_features: cpu::Features, + _cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::AesGcm(key) => key, _ => unreachable!(), }; - aes_gcm::seal(key, nonce, aad, in_out, cpu_features) + aes_gcm::seal(key, nonce, aad, in_out) } pub(super) fn aes_gcm_open( @@ -202,13 +202,13 @@ pub(super) fn aes_gcm_open( aad: Aad<&[u8]>, in_out: &mut [u8], src: RangeFrom, - cpu_features: cpu::Features, + _cpu_features: cpu::Features, ) -> Result { let key = match key { KeyInner::AesGcm(key) => key, _ => unreachable!(), }; - aes_gcm::open(key, nonce, aad, in_out, src, cpu_features) + aes_gcm::open(key, nonce, aad, in_out, src) } /// ChaCha20-Poly1305 as described in [RFC 8439]. diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs index 27716e4ba8..7fcfd88d86 100644 --- a/src/aead/gcm.rs +++ b/src/aead/gcm.rs @@ -1,4 +1,4 @@ -// Copyright 2018 Brian Smith. +// Copyright 2018-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above @@ -16,11 +16,13 @@ use self::ffi::{Block, BLOCK_LEN, ZERO_BLOCK}; use super::{aes_gcm, Aad}; use crate::{ bits::{BitLength, FromByteLen as _}, - cpu, error, - polyfill::{sliceutil::overwrite_at_start, ArraySplitMap as _}, + error, + polyfill::{sliceutil::overwrite_at_start, NotSend}, }; use cfg_if::cfg_if; +pub(super) use ffi::KeyValue; + cfg_if! { if #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] { pub(super) use self::ffi::{HTable, Xi}; @@ -31,48 +33,26 @@ cfg_if! 
{ #[macro_use] mod ffi; -mod gcm_nohw; - -#[derive(Clone)] -pub struct Key { - h_table: HTable, -} - -impl Key { - pub(super) fn new(h_be: Block, cpu_features: cpu::Features) -> Self { - let h: [u64; 2] = h_be.array_split_map(u64::from_be_bytes); - let h_table = match detect_implementation(cpu_features) { - #[cfg(target_arch = "x86_64")] - Implementation::CLMUL if has_avx_movbe(cpu_features) => unsafe { - htable_new!(gcm_init_avx, &h, cou_features) - }, - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - Implementation::CLMUL => unsafe { htable_new!(gcm_init_clmul, &h, cpu_features) }, - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - Implementation::NEON => unsafe { htable_new!(gcm_init_neon, &h, cpu_features) }, - - Implementation::Fallback => HTable::new_single_entry(gcm_nohw::init(h)), - }; - Self { h_table } - } -} +pub(super) mod clmul; +pub(super) mod clmulavxmovbe; +pub(super) mod fallback; +pub(super) mod neon; -pub struct Context<'key> { +pub(super) struct Context<'key, K> { Xi: Xi, - h_table: &'key HTable, + key: &'key K, aad_len: BitLength, in_out_len: BitLength, - cpu_features: cpu::Features, + _not_send: NotSend, } -impl<'key> Context<'key> { +impl<'key, K: Gmult> Context<'key, K> { + #[inline(always)] pub(crate) fn new( - key: &'key Key, + key: &'key K, aad: Aad<&[u8]>, in_out_len: usize, - cpu_features: cpu::Features, ) -> Result { if in_out_len > aes_gcm::MAX_IN_OUT_LEN { return Err(error::Unspecified); @@ -86,10 +66,10 @@ impl<'key> Context<'key> { let mut ctx = Self { Xi: Xi(ZERO_BLOCK), - h_table: &key.h_table, + key, aad_len, in_out_len, - cpu_features, + _not_send: NotSend::VALUE, }; for ad in aad.0.chunks(BLOCK_LEN) { @@ -100,8 +80,10 @@ impl<'key> Context<'key> { Ok(ctx) } +} - #[cfg(all(target_arch = "aarch64", target_pointer_width = "64"))] +#[cfg(all(target_arch = "aarch64", target_pointer_width = "64"))] +impl Context<'_, K> { pub(super) fn in_out_whole_block_bits(&self) -> BitLength { use crate::polyfill::usize_from_u64; const WHOLE_BLOCK_BITS_MASK: usize = !0b111_1111; @@ -110,160 +92,57 @@ impl<'key> Context<'key> { assert!(WHOLE_BLOCK_BITS_MASK == !((BLOCK_LEN * 8) - 1)); BitLength::from_bits(usize_from_u64(self.in_out_len.as_bits()) & WHOLE_BLOCK_BITS_MASK) } +} - /// Access to `inner` for the integrated AES-GCM implementations only. - #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +#[cfg(target_arch = "aarch64")] +/// Access to `inner` for the integrated AES-GCM implementations only. +impl Context<'_, clmul::Key> { #[inline] pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { - (self.h_table, &mut self.Xi) + (&self.key.inner(), &mut self.Xi) } +} - pub fn update_blocks(&mut self, input: &[[u8; BLOCK_LEN]]) { - let xi = &mut self.Xi; - let h_table = self.h_table; - - match detect_implementation(self.cpu_features) { - #[cfg(target_arch = "x86_64")] - // SAFETY: gcm_ghash_avx satisfies the ghash! contract. - Implementation::CLMUL if has_avx_movbe(self.cpu_features) => unsafe { - ghash!(gcm_ghash_avx, xi, h_table, input, self.cpu_features); - }, - - #[cfg(target_arch = "aarch64")] - // If we have CLMUL then we probably have AES, so the integrated - // implementation will take care of everything except any final - // partial block. Thus, we avoid having an optimized implementation - // here. - Implementation::CLMUL => self.update_blocks_1x(input), - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - // SAFETY: gcm_ghash_clmul satisfies the ghash! contract on these - // targets. 
- Implementation::CLMUL => unsafe { - ghash!(gcm_ghash_clmul, xi, h_table, input, self.cpu_features); - }, - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - // SAFETY: gcm_ghash_neon satisfies the ghash! contract on these - // targets. - Implementation::NEON => unsafe { - ghash!(gcm_ghash_neon, xi, h_table, input, self.cpu_features); - }, - - Implementation::Fallback => { - gcm_nohw::ghash(xi, h_table.first_entry(), input); - } - } +#[cfg(target_arch = "x86_64")] +impl Context<'_, clmulavxmovbe::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) } +} - #[cfg(target_arch = "aarch64")] - #[inline(never)] - fn update_blocks_1x(&mut self, input: &[[u8; BLOCK_LEN]]) { - for input in input { - self.update_block(*input); - } +impl Context<'_, K> { + #[inline(always)] + pub fn update_blocks(&mut self, input: &[[u8; BLOCK_LEN]]) { + self.key.update_blocks(&mut self.Xi, input); } +} +impl Context<'_, K> { pub fn update_block(&mut self, a: Block) { self.Xi.bitxor_assign(a); - - let xi = &mut self.Xi; - let h_table = self.h_table; - - match detect_implementation(self.cpu_features) { - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - Implementation::CLMUL => unsafe { - gmult!(gcm_gmult_clmul, xi, h_table, self.cpu_features) - }, - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - Implementation::NEON => unsafe { - gmult!(gcm_gmult_neon, xi, h_table, self.cpu_features) - }, - - Implementation::Fallback => { - gcm_nohw::gmult(xi, h_table.first_entry()); - } - } + self.key.gmult(&mut self.Xi); } + #[inline(always)] pub(super) fn pre_finish(mut self, f: F) -> super::Tag where - F: FnOnce(Block, cpu::Features) -> super::Tag, + F: FnOnce(Block) -> super::Tag, { let mut block = [0u8; BLOCK_LEN]; let (alen, clen) = block.split_at_mut(BLOCK_LEN / 2); alen.copy_from_slice(&BitLength::::to_be_bytes(self.aad_len)); clen.copy_from_slice(&BitLength::::to_be_bytes(self.in_out_len)); self.update_block(block); - f(self.Xi.into_block(), self.cpu_features) - } - - #[cfg(target_arch = "x86_64")] - pub(super) fn is_avx(&self) -> bool { - match detect_implementation(self.cpu_features) { - Implementation::CLMUL => has_avx_movbe(self.cpu_features), - _ => false, - } - } - - #[cfg(target_arch = "aarch64")] - pub(super) fn is_clmul(&self) -> bool { - matches!( - detect_implementation(self.cpu_features), - Implementation::CLMUL - ) + f(self.Xi.0) } } -#[allow(clippy::upper_case_acronyms)] -enum Implementation { - #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))] - CLMUL, - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - NEON, - - Fallback, -} - -#[inline] -fn detect_implementation(cpu_features: cpu::Features) -> Implementation { - // `cpu_features` is only used for specific platforms. 
- #[cfg(not(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86_64", - target_arch = "x86" - )))] - let _cpu_features = cpu_features; - - #[cfg(target_arch = "aarch64")] - { - if cpu::arm::PMULL.available(cpu_features) { - return Implementation::CLMUL; - } - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if cpu::intel::FXSR.available(cpu_features) && cpu::intel::PCLMULQDQ.available(cpu_features) - { - return Implementation::CLMUL; - } - } - - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - { - if cpu::arm::NEON.available(cpu_features) { - return Implementation::NEON; - } - } - - Implementation::Fallback +pub(super) trait Gmult { + fn gmult(&self, xi: &mut Xi); } -#[cfg(target_arch = "x86_64")] -fn has_avx_movbe(cpu_features: cpu::Features) -> bool { - cpu::intel::AVX.available(cpu_features) && cpu::intel::MOVBE.available(cpu_features) +pub(super) trait UpdateBlocks { + fn update_blocks(&self, xi: &mut Xi, input: &[[u8; BLOCK_LEN]]); } diff --git a/src/aead/gcm/clmul.rs b/src/aead/gcm/clmul.rs new file mode 100644 index 0000000000..848258a841 --- /dev/null +++ b/src/aead/gcm/clmul.rs @@ -0,0 +1,66 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
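A note on what the per-implementation GCM `Key` types below compute: the hash key is H = AES_K(0^128), wrapped in a `KeyValue` by `derive_gcm_key_value`. `Gmult::gmult` multiplies the accumulator `Xi` by H in GF(2^128), and `UpdateBlocks::update_blocks` folds whole 16-byte blocks into `Xi` using that same multiplication. The assembly-backed keys (`clmul`, `clmulavxmovbe`, `neon`) store an `HTable` whose entries the corresponding `gcm_init_*` routine precomputes from H, while `fallback::Key` stores only the single `U128` value of H, which is why, as the commit message notes, keys shrink on targets with no assembly implementation.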
+ +#![cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))] + +use super::{ffi::KeyValue, Gmult, HTable, Xi}; +use crate::cpu; + +#[cfg(target_arch = "aarch64")] +pub(in super::super) type RequiredCpuFeatures = cpu::arm::PMull; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = (cpu::intel::ClMul, cpu::intel::Fxsr); + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_clmul, value) }, + } + } + + #[cfg(target_arch = "x86_64")] + pub(super) fn new_avx( + value: KeyValue, + _cpu_features: super::clmulavxmovbe::RequiredCpuFeatures, + ) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_avx, value) }, + } + } + + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl Gmult for Key { + fn gmult(&self, xi: &mut Xi) { + unsafe { gmult!(gcm_gmult_clmul, xi, &self.h_table) } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl super::UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: &[[u8; super::BLOCK_LEN]]) { + let _: cpu::Features = cpu::features(); + unsafe { ghash!(gcm_ghash_clmul, xi, &self.h_table, input) } + } +} diff --git a/src/aead/gcm/clmulavxmovbe.rs b/src/aead/gcm/clmulavxmovbe.rs new file mode 100644 index 0000000000..753bb27906 --- /dev/null +++ b/src/aead/gcm/clmulavxmovbe.rs @@ -0,0 +1,53 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{clmul, Gmult, HTable, KeyValue, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::cpu; + +pub(in super::super) type RequiredCpuFeatures = ( + clmul::RequiredCpuFeatures, + cpu::intel::Avx, + cpu::intel::Movbe, +); + +#[derive(Clone)] +pub struct Key { + inner: clmul::Key, +} + +impl Key { + pub(in super::super) fn new(key_value: KeyValue, cpu: RequiredCpuFeatures) -> Self { + Self { + inner: clmul::Key::new_avx(key_value, cpu), + } + } + + pub(super) fn inner(&self) -> &HTable { + self.inner.inner() + } +} + +impl Gmult for Key { + fn gmult(&self, xi: &mut Xi) { + self.inner.gmult(xi) + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: &[[u8; BLOCK_LEN]]) { + unsafe { ghash!(gcm_ghash_avx, xi, &self.inner.inner(), input,) } + } +} diff --git a/src/aead/gcm/gcm_nohw.rs b/src/aead/gcm/fallback.rs similarity index 92% rename from src/aead/gcm/gcm_nohw.rs rename to src/aead/gcm/fallback.rs index 77ca08e056..219fbcc81f 100644 --- a/src/aead/gcm/gcm_nohw.rs +++ b/src/aead/gcm/fallback.rs @@ -1,5 +1,5 @@ // Copyright (c) 2019, Google Inc. -// Portions Copyright 2020 Brian Smith. 
+// Portions Copyright 2020-2024 Brian Smith. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above @@ -22,9 +22,32 @@ // // Unlike the BearSSL notes, we use u128 in the 64-bit implementation. -use super::{ffi::U128, Xi, BLOCK_LEN}; +use super::{ffi::U128, Gmult, KeyValue, UpdateBlocks, Xi, BLOCK_LEN}; use crate::polyfill::ArraySplitMap as _; +#[derive(Clone)] +pub struct Key { + h: U128, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue) -> Self { + Self { h: init(value) } + } +} + +impl Gmult for Key { + fn gmult(&self, xi: &mut Xi) { + gmult(xi, self.h); + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: &[[u8; BLOCK_LEN]]) { + ghash(xi, self.h, input); + } +} + #[cfg(target_pointer_width = "64")] fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { #[allow(clippy::cast_possible_truncation)] @@ -138,7 +161,9 @@ fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { (lo ^ (mid << 32), hi ^ (mid >> 32)) } -pub(super) fn init(xi: [u64; 2]) -> U128 { +fn init(value: KeyValue) -> U128 { + let xi = value.into_inner(); + // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This // avoids a shift by 1 in the multiplication, needed to account for bit // reversal losing a bit after multiplication, that is, @@ -217,13 +242,13 @@ fn gcm_polyval_nohw(xi: &mut [u64; 2], h: U128) { *xi = [r2, r3]; } -pub(super) fn gmult(xi: &mut Xi, h: U128) { +fn gmult(xi: &mut Xi, h: U128) { with_swapped_xi(xi, |swapped| { gcm_polyval_nohw(swapped, h); }) } -pub(super) fn ghash(xi: &mut Xi, h: U128, input: &[[u8; BLOCK_LEN]]) { +fn ghash(xi: &mut Xi, h: U128, input: &[[u8; BLOCK_LEN]]) { with_swapped_xi(xi, |swapped| { input.iter().for_each(|&input| { let input = input.array_split_map(u64::from_be_bytes); diff --git a/src/aead/gcm/ffi.rs b/src/aead/gcm/ffi.rs index b8dcee925f..6089800bea 100644 --- a/src/aead/gcm/ffi.rs +++ b/src/aead/gcm/ffi.rs @@ -12,7 +12,7 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -use crate::constant_time; +use crate::{constant_time, polyfill::ArraySplitMap}; pub(in super::super) const BLOCK_LEN: usize = 16; pub(in super::super) type Block = [u8; BLOCK_LEN]; @@ -25,12 +25,12 @@ pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; target_arch = "x86_64" ))] macro_rules! htable_new { - ( $name:ident, $input:expr, $cpu_features:expr ) => {{ + ( $name:ident, $value:expr $(,)? ) => {{ use crate::aead::gcm::ffi::HTable; prefixed_extern! { fn $name(HTable: &mut HTable, h: &[u64; 2]); } - HTable::new($name, $input) + HTable::new($name, $value) }}; } @@ -41,12 +41,12 @@ macro_rules! htable_new { target_arch = "x86_64" ))] macro_rules! gmult { - ( $name:ident, $xi:expr, $h_table:expr, $cpu_features:expr ) => {{ + ( $name:ident, $xi:expr, $h_table:expr $(,)? ) => {{ use crate::aead::gcm::ffi::{HTable, Xi}; prefixed_extern! { fn $name(xi: &mut Xi, Htable: &HTable); } - $h_table.gmult($name, $xi, $cpu_features) + $h_table.gmult($name, $xi) }}; } @@ -60,7 +60,7 @@ macro_rules! gmult { target_arch = "x86_64" ))] macro_rules! ghash { - ( $name:ident, $xi:expr, $h_table:expr, $input:expr, $cpu_features:expr ) => {{ + ( $name:ident, $xi:expr, $h_table:expr, $input:expr $(,)? ) => {{ use crate::aead::gcm::ffi::{HTable, Xi}; prefixed_extern! { fn $name( @@ -70,10 +70,22 @@ macro_rules! 
ghash { len: crate::c::NonZero_size_t, ); } - $h_table.ghash($name, $xi, $input, $cpu_features) + $h_table.ghash($name, $xi, $input) }}; } +pub(in super::super) struct KeyValue([u64; 2]); + +impl KeyValue { + pub(in super::super) fn new(value: Block) -> Self { + Self(value.array_split_map(u64::from_be_bytes)) + } + + pub(super) fn into_inner(self) -> [u64; 2] { + self.0 + } +} + /// SAFETY: /// * `f` must read `len` bytes from `inp`; it may assume /// that `len` is a (non-zero) multiple of `BLOCK_LEN`. @@ -86,13 +98,13 @@ macro_rules! ghash { ))] impl HTable { pub(super) unsafe fn new( - init: unsafe extern "C" fn(HTable: &mut HTable, h: &[u64; 2]), - value: &[u64; 2], + init: unsafe extern "C" fn(HTable: &mut HTable, &[u64; 2]), + value: KeyValue, ) -> Self { let mut r = Self { Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN], }; - unsafe { init(&mut r, value) }; + unsafe { init(&mut r, &value.0) }; r } @@ -100,7 +112,6 @@ impl HTable { &self, f: unsafe extern "C" fn(xi: &mut Xi, h_table: &HTable), xi: &mut Xi, - _cpu_features: crate::cpu::Features, ) { unsafe { f(xi, self) } } @@ -115,7 +126,6 @@ impl HTable { ), xi: &mut Xi, input: &[[u8; BLOCK_LEN]], - cpu_features: crate::cpu::Features, ) { use crate::polyfill::slice; use core::num::NonZeroUsize; @@ -129,31 +139,15 @@ impl HTable { } }; - let _: crate::cpu::Features = cpu_features; // SAFETY: // * There are `input_len: NonZeroUsize` bytes available at `input` for // `f` to read. - // * CPU feature detection has been done. unsafe { f(xi, self, input.as_ptr(), input_len); } } } -impl HTable { - pub(super) fn new_single_entry(first_entry: U128) -> Self { - let mut r = Self { - Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN], - }; - r.Htable[0] = first_entry; - r - } - - pub(super) fn first_entry(&self) -> U128 { - self.Htable[0] - } -} - // The alignment is required by some assembly code. #[derive(Clone)] #[repr(C, align(16))] @@ -178,9 +172,4 @@ impl Xi { pub(super) fn bitxor_assign(&mut self, a: Block) { self.0 = constant_time::xor_16(self.0, a) } - - #[inline] - pub(super) fn into_block(self) -> Block { - self.0 - } } diff --git a/src/aead/gcm/neon.rs b/src/aead/gcm/neon.rs new file mode 100644 index 0000000000..f1dd07cf25 --- /dev/null +++ b/src/aead/gcm/neon.rs @@ -0,0 +1,45 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+
+#![cfg(any(target_arch = "aarch64", target_arch = "arm"))]
+
+use super::{Gmult, HTable, KeyValue, UpdateBlocks, Xi, BLOCK_LEN};
+use crate::cpu;
+
+pub(in super::super) type RequiredCpuFeatures = cpu::arm::Neon;
+
+#[derive(Clone)]
+pub struct Key {
+    h_table: HTable,
+}
+
+impl Key {
+    pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self {
+        Self {
+            h_table: unsafe { htable_new!(gcm_init_neon, value) },
+        }
+    }
+}
+
+impl Gmult for Key {
+    fn gmult(&self, xi: &mut Xi) {
+        unsafe { gmult!(gcm_gmult_neon, xi, &self.h_table) }
+    }
+}
+
+impl UpdateBlocks for Key {
+    fn update_blocks(&self, xi: &mut Xi, input: &[[u8; BLOCK_LEN]]) {
+        unsafe { ghash!(gcm_ghash_neon, xi, &self.h_table, input) }
+    }
+}
diff --git a/src/aead/shift.rs b/src/aead/shift.rs
index fc2227378f..f4a62eceeb 100644
--- a/src/aead/shift.rs
+++ b/src/aead/shift.rs
@@ -14,7 +14,7 @@
 
 use crate::polyfill::sliceutil::overwrite_at_start;
 
-#[cfg(target_arch = "x86")]
+#[allow(dead_code)]
 pub fn shift_full_blocks(
     in_out: &mut [u8],
     src: core::ops::RangeFrom<usize>,
diff --git a/src/cpu.rs b/src/cpu.rs
index bd5833ab99..90b6445c7f 100644
--- a/src/cpu.rs
+++ b/src/cpu.rs
@@ -14,6 +14,60 @@
 
 pub(crate) use self::features::Features;
 
+macro_rules! impl_get_feature {
+    { $feature:path => $T:ident } => {
+        #[derive(Clone, Copy)]
+        pub(crate) struct $T(crate::cpu::Features);
+
+        impl crate::cpu::GetFeature<$T> for super::Features {
+            fn get_feature(&self) -> Option<$T> {
+                if $feature.available(*self) {
+                    Some($T(*self))
+                } else {
+                    None
+                }
+            }
+        }
+
+        impl From<$T> for crate::cpu::Features {
+            fn from($T(features): $T) -> Self {
+                features
+            }
+        }
+    }
+}
+
+pub(crate) trait GetFeature<T> {
+    fn get_feature(&self) -> Option<T>;
+}
+
+impl<A, B, T> GetFeature<(A, B)> for T
+where
+    T: GetFeature<A>,
+    T: GetFeature<B>,
+{
+    fn get_feature(&self) -> Option<(A, B)> {
+        match (self.get_feature(), self.get_feature()) {
+            (Some(a), Some(b)) => Some((a, b)),
+            _ => None,
+        }
+    }
+}
+
+impl<A, B, C, T> GetFeature<(A, B, C)> for T
+where
+    T: GetFeature<A>,
+    T: GetFeature<B>,
+    T: GetFeature<C>,
+{
+    fn get_feature(&self) -> Option<(A, B, C)> {
+        match (self.get_feature(), self.get_feature(), self.get_feature()) {
+            (Some(a), Some(b), Some(c)) => Some((a, b, c)),
+            _ => None,
+        }
+    }
+}
+
 #[inline(always)]
 pub(crate) fn features() -> Features {
     get_or_init_feature_flags()
diff --git a/src/cpu/arm.rs b/src/cpu/arm.rs
index be6322af6d..d2920a10eb 100644
--- a/src/cpu/arm.rs
+++ b/src/cpu/arm.rs
@@ -63,7 +63,7 @@ cfg_if::cfg_if! {
 macro_rules! features {
     {
         $(
-            $target_feature_name:expr => $name:ident {
+            $target_feature_name:expr => $TyName:ident($name:ident) {
                 mask: $mask:expr,
             }
         ),+
@@ -74,6 +74,7 @@ macro_rules! features {
             pub(crate) const $name: Feature = Feature {
                 mask: $mask,
             };
+            impl_get_feature!{ $name => $TyName }
         )+
 
         // See const assertions below.
@@ -115,17 +116,17 @@ impl Feature {
 #[cfg(target_arch = "aarch64")]
 features! {
     // Keep in sync with `ARMV7_NEON`.
-    "neon" => NEON {
+    "neon" => Neon(NEON) {
         mask: 1 << 0,
     },
 
     // Keep in sync with `ARMV8_AES`.
-    "aes" => AES {
+    "aes" => Aes(AES) {
         mask: 1 << 2,
     },
 
     // Keep in sync with `ARMV8_SHA256`.
-    "sha2" => SHA256 {
+    "sha2" => Sha256(SHA256) {
         mask: 1 << 4,
     },
 
@@ -137,13 +138,13 @@ features! {
     // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile
     // "Features introduced prior to 2020." Change this to use "pmull" when
     // that is supported.
-    "aes" => PMULL {
+    "aes" => PMull(PMULL) {
         mask: 1 << 5,
     },
 
     // Keep in sync with `ARMV8_SHA512`.
// "sha3" is overloaded for both SHA-3 and SHA512. - "sha3" => SHA512 { + "sha3" => Sha512(SHA512) { mask: 1 << 6, }, } @@ -151,7 +152,7 @@ features! { #[cfg(target_arch = "arm")] features! { // Keep in sync with `ARMV7_NEON`. - "neon" => NEON { + "neon" => Neon(NEON) { mask: 1 << 0, }, } diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs index d2d18e316a..172fe47bef 100644 --- a/src/cpu/intel.rs +++ b/src/cpu/intel.rs @@ -12,6 +12,8 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +use cfg_if::cfg_if; + mod abi_assumptions { // TOOD: Support targets that do not have SSE and SSE2 enabled, such as // x86_64-unknown-linux-none. See @@ -118,22 +120,32 @@ pub(crate) const SSE41: Feature = Feature { mask: 1 << 19, }; -#[cfg(target_arch = "x86_64")] -pub(crate) const MOVBE: Feature = Feature { - word: 1, - mask: 1 << 22, -}; - pub(crate) const AES: Feature = Feature { word: 1, mask: 1 << 25, }; -#[cfg(target_arch = "x86_64")] -pub(crate) const AVX: Feature = Feature { - word: 1, - mask: 1 << 28, -}; +impl_get_feature! { AES => Aes } +impl_get_feature! { FXSR => Fxsr } +impl_get_feature! { PCLMULQDQ => ClMul } +impl_get_feature! { SSSE3 => Ssse3 } + +cfg_if! { + if #[cfg(any(target_arch = "x86_64"))] { + pub(crate) const MOVBE: Feature = Feature { + word: 1, + mask: 1 << 22, + }; + + pub(crate) const AVX: Feature = Feature { + word: 1, + mask: 1 << 28, + }; + + impl_get_feature!{ MOVBE => Movbe } + impl_get_feature!{ AVX => Avx } + } +} #[cfg(all(target_arch = "x86_64", test))] mod x86_64_tests { diff --git a/src/lib.rs b/src/lib.rs index f501856bee..cb2d55bf1c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,6 +82,15 @@ clippy::cast_precision_loss, clippy::cast_sign_loss )] +#![cfg_attr( + not(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" + )), + allow(dead_code, unused_imports, unused_macros) +)] #![no_std] #[cfg(feature = "alloc")] diff --git a/src/polyfill.rs b/src/polyfill.rs index 4d5a0ec1f0..96702a7a80 100644 --- a/src/polyfill.rs +++ b/src/polyfill.rs @@ -64,6 +64,15 @@ mod test; mod unwrap_const; +#[cfg_attr( + not(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "x86", + target_arch = "x86_64" + )), + allow(unused_imports) +)] pub use self::{ array_flat_map::ArrayFlatMap, array_split_map::ArraySplitMap, notsend::NotSend, unwrap_const::unwrap_const,