diff --git a/TODO.md b/TODO.md index 42ef6e43bb183..2764cd8d97c1a 100644 --- a/TODO.md +++ b/TODO.md @@ -5,6 +5,7 @@ Intel intrinsics. Replace `SSE4.2` with the intended type. rg '^> TODO.md ``` +`rg` is the ripgrep tool, which can be installed with `cargo install ripgrep`. sse --- @@ -535,3 +536,391 @@ sse4.2 * [ ] `_mm_crc32_u16` * [ ] `_mm_crc32_u32` * [ ] `_mm_crc32_u64` + + +avx +--- +* [ ] `_mm256_add_pd` +* [ ] `_mm256_add_ps` +* [ ] `_mm256_addsub_pd` +* [ ] `_mm256_addsub_ps` +* [ ] `_mm256_and_pd` +* [ ] `_mm256_and_ps` +* [ ] `_mm256_andnot_pd` +* [ ] `_mm256_andnot_ps` +* [ ] `_mm256_blend_pd` +* [ ] `_mm256_blend_ps` +* [ ] `_mm256_blendv_pd` +* [ ] `_mm256_blendv_ps` +* [ ] `_mm256_div_pd` +* [ ] `_mm256_div_ps` +* [ ] `_mm256_dp_ps` +* [ ] `_mm256_hadd_pd` +* [ ] `_mm256_hadd_ps` +* [ ] `_mm256_hsub_pd` +* [ ] `_mm256_hsub_ps` +* [ ] `_mm256_max_pd` +* [ ] `_mm256_max_ps` +* [ ] `_mm256_min_pd` +* [ ] `_mm256_min_ps` +* [ ] `_mm256_mul_pd` +* [ ] `_mm256_mul_ps` +* [ ] `_mm256_or_pd` +* [ ] `_mm256_or_ps` +* [ ] `_mm256_shuffle_pd` +* [ ] `_mm256_shuffle_ps` +* [ ] `_mm256_sub_pd` +* [ ] `_mm256_sub_ps` +* [ ] `_mm256_xor_pd` +* [ ] `_mm256_xor_ps` +* [ ] `_mm_cmp_pd` +* [ ] `_mm256_cmp_pd` +* [ ] `_mm_cmp_ps` +* [ ] `_mm256_cmp_ps` +* [ ] `_mm_cmp_sd` +* [ ] `_mm_cmp_ss` +* [ ] `_mm256_cvtepi32_pd` +* [ ] `_mm256_cvtepi32_ps` +* [ ] `_mm256_cvtpd_ps` +* [ ] `_mm256_cvtps_epi32` +* [ ] `_mm256_cvtps_pd` +* [ ] `_mm256_cvttpd_epi32` +* [ ] `_mm256_cvtpd_epi32` +* [ ] `_mm256_cvttps_epi32` +* [ ] `_mm256_extractf128_ps` +* [ ] `_mm256_extractf128_pd` +* [ ] `_mm256_extractf128_si256` +* [ ] `_mm256_extract_epi8` +* [ ] `_mm256_extract_epi16` +* [ ] `_mm256_extract_epi32` +* [ ] `_mm256_extract_epi64` +* [ ] `_mm256_zeroall` +* [ ] `_mm256_zeroupper` +* [ ] `_mm256_permutevar_ps` +* [ ] `_mm_permutevar_ps` +* [ ] `_mm256_permute_ps` +* [ ] `_mm_permute_ps` +* [ ] `_mm256_permutevar_pd` +* [ ] `_mm_permutevar_pd` +* [ ] `_mm256_permute_pd` +* [ ] `_mm_permute_pd` +* [ ] `_mm256_permute2f128_ps` +* [ ] `_mm256_permute2f128_pd` +* [ ] `_mm256_permute2f128_si256` +* [ ] `_mm256_broadcast_ss` +* [ ] `_mm_broadcast_ss` +* [ ] `_mm256_broadcast_sd` +* [ ] `_mm256_broadcast_ps` +* [ ] `_mm256_broadcast_pd` +* [ ] `_mm256_insertf128_ps` +* [ ] `_mm256_insertf128_pd` +* [ ] `_mm256_insertf128_si256` +* [ ] `_mm256_insert_epi8` +* [ ] `_mm256_insert_epi16` +* [ ] `_mm256_insert_epi32` +* [ ] `_mm256_insert_epi64` +* [ ] `_mm256_load_pd` +* [ ] `_mm256_store_pd` +* [ ] `_mm256_load_ps` +* [ ] `_mm256_store_ps` +* [ ] `_mm256_loadu_pd` +* [ ] `_mm256_storeu_pd` +* [ ] `_mm256_loadu_ps` +* [ ] `_mm256_storeu_ps` +* [ ] `_mm256_load_si256` +* [ ] `_mm256_store_si256` +* [ ] `_mm256_loadu_si256` +* [ ] `_mm256_storeu_si256` +* [ ] `_mm256_maskload_pd` +* [ ] `_mm256_maskstore_pd` +* [ ] `_mm_maskload_pd` +* [ ] `_mm_maskstore_pd` +* [ ] `_mm256_maskload_ps` +* [ ] `_mm256_maskstore_ps` +* [ ] `_mm_maskload_ps` +* [ ] `_mm_maskstore_ps` +* [ ] `_mm256_movehdup_ps` +* [ ] `_mm256_moveldup_ps` +* [ ] `_mm256_movedup_pd` +* [ ] `_mm256_lddqu_si256` +* [ ] `_mm256_stream_si256` +* [ ] `_mm256_stream_pd` +* [ ] `_mm256_stream_ps` +* [ ] `_mm256_rcp_ps` +* [ ] `_mm256_rsqrt_ps` +* [ ] `_mm256_sqrt_pd` +* [ ] `_mm256_sqrt_ps` +* [ ] `_mm256_round_pd` +* [ ] `_mm256_round_ps` +* [ ] `_mm256_unpackhi_pd` +* [ ] `_mm256_unpackhi_ps` +* [ ] `_mm256_unpacklo_pd` +* [ ] `_mm256_unpacklo_ps` +* [ ] `_mm256_testz_si256` +* [ ] `_mm256_testc_si256` +* [ ] `_mm256_testnzc_si256` +* [ ] `_mm256_testz_pd` +* [ ] 
`_mm256_testc_pd` +* [ ] `_mm256_testnzc_pd` +* [ ] `_mm_testz_pd` +* [ ] `_mm_testc_pd` +* [ ] `_mm_testnzc_pd` +* [ ] `_mm256_testz_ps` +* [ ] `_mm256_testc_ps` +* [ ] `_mm256_testnzc_ps` +* [ ] `_mm_testz_ps` +* [ ] `_mm_testc_ps` +* [ ] `_mm_testnzc_ps` +* [ ] `_mm256_movemask_pd` +* [ ] `_mm256_movemask_ps` +* [ ] `_mm256_setzero_pd` +* [ ] `_mm256_setzero_ps` +* [ ] `_mm256_setzero_si256` +* [ ] `_mm256_set_pd` +* [ ] `_mm256_set_ps` +* [ ] `_mm256_set_epi8` +* [ ] `_mm256_set_epi16` +* [ ] `_mm256_set_epi32` +* [ ] `_mm256_set_epi64x` +* [ ] `_mm256_setr_pd` +* [ ] `_mm256_setr_ps` +* [ ] `_mm256_setr_epi8` +* [ ] `_mm256_setr_epi16` +* [ ] `_mm256_setr_epi32` +* [ ] `_mm256_setr_epi64x` +* [ ] `_mm256_set1_pd` +* [ ] `_mm256_set1_ps` +* [ ] `_mm256_set1_epi8` +* [ ] `_mm256_set1_epi16` +* [ ] `_mm256_set1_epi32` +* [ ] `_mm256_set1_epi64x` +* [ ] `_mm256_castpd_ps` +* [ ] `_mm256_castps_pd` +* [ ] `_mm256_castps_si256` +* [ ] `_mm256_castpd_si256` +* [ ] `_mm256_castsi256_ps` +* [ ] `_mm256_castsi256_pd` +* [ ] `_mm256_castps256_ps128` +* [ ] `_mm256_castpd256_pd128` +* [ ] `_mm256_castsi256_si128` +* [ ] `_mm256_castps128_ps256` +* [ ] `_mm256_castpd128_pd256` +* [ ] `_mm256_castsi128_si256` +* [ ] `_mm256_zextps128_ps256` +* [ ] `_mm256_zextpd128_pd256` +* [ ] `_mm256_zextsi128_si256` +* [ ] `_mm256_floor_ps` +* [ ] `_mm256_ceil_ps` +* [ ] `_mm256_floor_pd` +* [ ] `_mm256_ceil_pd` +* [ ] `_mm256_undefined_ps` +* [ ] `_mm256_undefined_pd` +* [ ] `_mm256_undefined_si256` +* [ ] `_mm256_set_m128` +* [ ] `_mm256_set_m128d` +* [ ] `_mm256_set_m128i` +* [ ] `_mm256_setr_m128` +* [ ] `_mm256_setr_m128d` +* [ ] `_mm256_setr_m128i` +* [ ] `_mm256_loadu2_m128` +* [ ] `_mm256_loadu2_m128d` +* [ ] `_mm256_loadu2_m128i` +* [ ] `_mm256_storeu2_m128` +* [ ] `_mm256_storeu2_m128d` +* [ ] `_mm256_storeu2_m128i` + + + +avx2 +---- +* [x] `_mm256_abs_epi8` +* [x] `_mm256_abs_epi16` +* [x] `_mm256_abs_epi32` +* [x] `_mm256_add_epi8` +* [x] `_mm256_add_epi16` +* [x] `_mm256_add_epi32` +* [x] `_mm256_add_epi64` +* [x] `_mm256_adds_epi8` +* [x] `_mm256_adds_epi16` +* [x] `_mm256_adds_epu8` +* [x] `_mm256_adds_epu16` +* [ ] `_mm256_alignr_epi8` +* [x] `_mm256_and_si256` +* [x] `_mm256_andnot_si256` +* [x] `_mm256_avg_epu8` +* [x] `_mm256_avg_epu16` +* [ ] `_mm256_blend_epi16` +* [ ] `_mm_blend_epi32` +* [ ] `_mm256_blend_epi32` +* [x] `_mm256_blendv_epi8` +* [ ] `_mm_broadcastb_epi8` +* [ ] `_mm256_broadcastb_epi8` +* [ ] `_mm_broadcastd_epi32` +* [ ] `_mm256_broadcastd_epi32` +* [ ] `_mm_broadcastq_epi64` +* [ ] `_mm256_broadcastq_epi64` +* [ ] `_mm_broadcastsd_pd` +* [ ] `_mm256_broadcastsd_pd` +* [ ] `_mm_broadcastsi128_si256` +* [ ] `_mm256_broadcastsi128_si256` +* [ ] `_mm_broadcastss_ps` +* [ ] `_mm256_broadcastss_ps` +* [ ] `_mm_broadcastw_epi16` +* [ ] `_mm256_broadcastw_epi16` +* [x] `_mm256_cmpeq_epi8` +* [x] `_mm256_cmpeq_epi16` +* [x] `_mm256_cmpeq_epi32` +* [x] `_mm256_cmpeq_epi64` +* [x] `_mm256_cmpgt_epi8` +* [x] `_mm256_cmpgt_epi16` +* [x] `_mm256_cmpgt_epi32` +* [x] `_mm256_cmpgt_epi64` +* [ ] `_mm256_cvtepi16_epi32` +* [ ] `_mm256_cvtepi16_epi64` +* [ ] `_mm256_cvtepi32_epi64` +* [ ] `_mm256_cvtepi8_epi16` +* [ ] `_mm256_cvtepi8_epi32` +* [ ] `_mm256_cvtepi8_epi64` +* [ ] `_mm256_cvtepu16_epi32` +* [ ] `_mm256_cvtepu16_epi64` +* [ ] `_mm256_cvtepu32_epi64` +* [ ] `_mm256_cvtepu8_epi16` +* [ ] `_mm256_cvtepu8_epi32` +* [ ] `_mm256_cvtepu8_epi64` +* [ ] `_mm256_extracti128_si256` +* [x] `_mm256_hadd_epi16` +* [x] `_mm256_hadd_epi32` +* [x] `_mm256_hadds_epi16` +* [x] `_mm256_hsub_epi16` 
+* [x] `_mm256_hsub_epi32` +* [x] `_mm256_hsubs_epi16` +* [ ] `_mm_i32gather_pd` +* [ ] `_mm256_i32gather_pd` +* [ ] `_mm_i32gather_ps` +* [ ] `_mm256_i32gather_ps` +* [ ] `_mm_i32gather_epi32` +* [ ] `_mm256_i32gather_epi32` +* [ ] `_mm_i32gather_epi64` +* [ ] `_mm256_i32gather_epi64` +* [ ] `_mm_i64gather_pd` +* [ ] `_mm256_i64gather_pd` +* [ ] `_mm_i64gather_ps` +* [ ] `_mm256_i64gather_ps` +* [ ] `_mm_i64gather_epi32` +* [ ] `_mm256_i64gather_epi32` +* [ ] `_mm_i64gather_epi64` +* [ ] `_mm256_i64gather_epi64` +* [ ] `_mm256_inserti128_si256` +* [ ] `_mm256_madd_epi16` +* [ ] `_mm256_maddubs_epi16` +* [ ] `_mm_mask_i32gather_pd` +* [ ] `_mm256_mask_i32gather_pd` +* [ ] `_mm_mask_i32gather_ps` +* [ ] `_mm256_mask_i32gather_ps` +* [ ] `_mm_mask_i32gather_epi32` +* [ ] `_mm256_mask_i32gather_epi32` +* [ ] `_mm_mask_i32gather_epi64` +* [ ] `_mm256_mask_i32gather_epi64` +* [ ] `_mm_mask_i64gather_pd` +* [ ] `_mm256_mask_i64gather_pd` +* [ ] `_mm_mask_i64gather_ps` +* [ ] `_mm256_mask_i64gather_ps` +* [ ] `_mm_mask_i64gather_epi32` +* [ ] `_mm256_mask_i64gather_epi32` +* [ ] `_mm_mask_i64gather_epi64` +* [ ] `_mm256_mask_i64gather_epi64` +* [ ] `_mm_maskload_epi32` +* [ ] `_mm256_maskload_epi32` +* [ ] `_mm_maskload_epi64` +* [ ] `_mm256_maskload_epi64` +* [ ] `_mm_maskstore_epi32` +* [ ] `_mm256_maskstore_epi32` +* [ ] `_mm_maskstore_epi64` +* [ ] `_mm256_maskstore_epi64` +* [ ] `_mm256_max_epi8` +* [ ] `_mm256_max_epi16` +* [ ] `_mm256_max_epi32` +* [ ] `_mm256_max_epu8` +* [ ] `_mm256_max_epu16` +* [ ] `_mm256_max_epu32` +* [ ] `_mm256_min_epi8` +* [ ] `_mm256_min_epi16` +* [ ] `_mm256_min_epi32` +* [ ] `_mm256_min_epu8` +* [ ] `_mm256_min_epu16` +* [ ] `_mm256_min_epu32` +* [ ] `_mm256_movemask_epi8` +* [ ] `_mm256_mpsadbw_epu8` +* [ ] `_mm256_mul_epi32` +* [ ] `_mm256_mul_epu32` +* [ ] `_mm256_mulhi_epi16` +* [ ] `_mm256_mulhi_epu16` +* [ ] `_mm256_mulhrs_epi16` +* [ ] `_mm256_mullo_epi16` +* [ ] `_mm256_mullo_epi32` +* [ ] `_mm256_or_si256` +* [ ] `_mm256_packs_epi16` +* [ ] `_mm256_packs_epi32` +* [ ] `_mm256_packus_epi16` +* [ ] `_mm256_packus_epi32` +* [ ] `_mm256_permute2x128_si256` +* [ ] `_mm256_permute4x64_epi64` +* [ ] `_mm256_permute4x64_pd` +* [ ] `_mm256_permutevar8x32_epi32` +* [ ] `_mm256_permutevar8x32_ps` +* [ ] `_mm256_sad_epu8` +* [ ] `_mm256_shuffle_epi32` +* [ ] `_mm256_shuffle_epi8` +* [ ] `_mm256_shufflehi_epi16` +* [ ] `_mm256_shufflelo_epi16` +* [ ] `_mm256_sign_epi8` +* [ ] `_mm256_sign_epi16` +* [ ] `_mm256_sign_epi32` +* [ ] `_mm256_slli_si256` +* [ ] `_mm256_bslli_epi128` +* [ ] `_mm256_sll_epi16` +* [ ] `_mm256_slli_epi16` +* [ ] `_mm256_sll_epi32` +* [ ] `_mm256_slli_epi32` +* [ ] `_mm256_sll_epi64` +* [ ] `_mm256_slli_epi64` +* [ ] `_mm_sllv_epi32` +* [ ] `_mm256_sllv_epi32` +* [ ] `_mm_sllv_epi64` +* [ ] `_mm256_sllv_epi64` +* [ ] `_mm256_sra_epi16` +* [ ] `_mm256_srai_epi16` +* [ ] `_mm256_sra_epi32` +* [ ] `_mm256_srai_epi32` +* [ ] `_mm_srav_epi32` +* [ ] `_mm256_srav_epi32` +* [ ] `_mm256_srli_si256` +* [ ] `_mm256_bsrli_epi128` +* [ ] `_mm256_srl_epi16` +* [ ] `_mm256_srli_epi16` +* [ ] `_mm256_srl_epi32` +* [ ] `_mm256_srli_epi32` +* [ ] `_mm256_srl_epi64` +* [ ] `_mm256_srli_epi64` +* [ ] `_mm_srlv_epi32` +* [ ] `_mm256_srlv_epi32` +* [ ] `_mm_srlv_epi64` +* [ ] `_mm256_srlv_epi64` +* [ ] `_mm256_stream_load_si256` +* [ ] `_mm256_sub_epi8` +* [ ] `_mm256_sub_epi16` +* [ ] `_mm256_sub_epi32` +* [ ] `_mm256_sub_epi64` +* [ ] `_mm256_subs_epi8` +* [ ] `_mm256_subs_epi16` +* [ ] `_mm256_subs_epu8` +* [ ] `_mm256_subs_epu16` +* [ ] `_mm256_xor_si256` +* 
[ ] `_mm256_unpackhi_epi8` +* [ ] `_mm256_unpackhi_epi16` +* [ ] `_mm256_unpackhi_epi32` +* [ ] `_mm256_unpackhi_epi64` +* [ ] `_mm256_unpacklo_epi8` +* [ ] `_mm256_unpacklo_epi16` +* [ ] `_mm256_unpacklo_epi32` +* [ ] `_mm256_unpacklo_epi64` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 0a4588b1782e7..7ec508231d37b 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,3 +1,699 @@ -use simd::*; -use v128::*; -use v64::*; +use v256::*; +use x86::__m256i; + +/// Computes the absolute values of packed 32-bit integers in `a`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { + unsafe { pabsd(a) } +} + +/// Computes the absolute values of packed 16-bit integers in `a`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { + unsafe { pabsw(a) } +} + +/// Computes the absolute values of packed 8-bit integers in `a`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { + unsafe { pabsb(a) } +} + +/// Add packed 64-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { + a + b +} + +/// Add packed 32-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { + a + b +} + +/// Add packed 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { + a + b +} + +/// Add packed 8-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { + a + b +} + +/// Add packed 8-bit integers in `a` and `b` using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { paddsb(a, b) } +} + +/// Add packed 16-bit integers in `a` and `b` using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { paddsw(a, b) } +} + +/// Add packed unsigned 8-bit integers in `a` and `b` using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { paddusb(a, b) } +} + +/// Add packed unsigned 16-bit integers in `a` and `b` using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { + unsafe { paddusw(a, b) } +} + +/// Compute the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { + a & b +} + +/// Compute the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { + (!a) & b +} + +/// Average packed unsigned 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { + unsafe { pavgw(a, b) } +} + +/// Average packed unsigned 8-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { + unsafe { pavgb(a, b) } +} + +// TODO _mm256_alignr_epi8 +// TODO _mm256_blend_epi16 +// TODO _mm_blend_epi32 +// TODO _mm256_blend_epi32 + +/// Blend packed 8-bit integers from `a` and `b` using `mask`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 { + unsafe { pblendvb(a, b, mask) } +} + +// TODO _mm_broadcastb_epi8 +// TODO _mm256_broadcastb_epi8 +// TODO _mm_broadcastd_epi32 +// TODO _mm256_broadcastd_epi32 +// TODO _mm_broadcastq_epi64 +// TODO _mm256_broadcastq_epi64 +// TODO _mm_broadcastsd_pd +// TODO _mm256_broadcastsd_pd +// TODO _mm_broadcastsi128_si256 +// TODO _mm256_broadcastsi128_si256 +// TODO _mm_broadcastss_ps +// TODO _mm256_broadcastss_ps +// TODO _mm_broadcastw_epi16 +// TODO _mm256_broadcastw_epi16 +// TODO _mm256_bslli_epi128 +// TODO _mm256_bsrli_epi128 + + +/// Compare packed 64-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { + a.eq(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { + a.eq(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { + a.eq(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { + a.eq(b) +} + +/// Compare packed 64-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { + a.gt(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { + a.gt(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { + a.gt(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { + a.gt(b) +} + +// TODO _mm256_cvtepi16_epi32 +// TODO _mm256_cvtepi16_epi64 +// TODO _mm256_cvtepi32_epi64 +// TODO _mm256_cvtepi8_epi16 +// TODO _mm256_cvtepi8_epi32 +// TODO _mm256_cvtepi8_epi64 +// TODO _mm256_cvtepu16_epi32 +// TODO _mm256_cvtepu16_epi64 +// TODO _mm256_cvtepu32_epi64 +// TODO _mm256_cvtepu8_epi16 +// TODO _mm256_cvtepu8_epi32 +// TODO _mm256_cvtepu8_epi64 +// TODO __m128i _mm256_extracti128_si256 + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddw(a, b) } +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phaddd(a, b) } +} + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddsw(a, b) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubw(a, b) } +} + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phsubd(a, b) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubsw(a, b) } +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx2.pabs.b"] + fn pabsb(a: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pabs.w"] + fn pabsw(a: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pabs.d"] + fn pabsd(a: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.padds.b"] + fn paddsb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.padds.w"] + fn paddsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.paddus.b"] + fn paddusb(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.paddus.w"] + fn paddusw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pavg.b"] + fn pavgb(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pavg.w"] + fn pavgw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pblendvb"] + fn pblendvb(a: i8x32, b: i8x32, mask: __m256i) -> i8x32; + #[link_name = "llvm.x86.avx2.phadd.w"] + fn phaddw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phadd.d"] + fn phaddd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phadd.sw"] + fn phaddsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.w"] + fn phsubw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.d"] + fn phsubd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phsub.sw"] + fn phsubsw(a: i16x16, b: i16x16) -> i16x16; +} + + +#[cfg(test)] +mod tests { + use v256::*; + use x86::avx2; + use x86::__m256i; + use std; + + #[test] + #[target_feature = "+avx2"] + fn _mm256_abs_epi32() { + let a = i32x8::new( + 0, 1, -1, std::i32::MAX, + std::i32::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi32(a); + let e = i32x8::new( + 0, 1, 1, std::i32::MAX, + (std::i32::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_abs_epi16() { + let a = i16x16::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, + std::i16::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi16(a); + let e = i16x16::new( + 0, 1, 1, 2, + 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, + (std::i16::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_abs_epi8() { + let a = i8x32::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi8(a); + let e = i8x32::new( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_add_epi64() { + let a = i64x4::new(-10, 0, 100, 1_000_000_000); + let b = i64x4::new(-1, 0, 1, 2); + let r = avx2::_mm256_add_epi64(a, b); + let e = i64x4::new(-11, 0, 101, 1_000_000_002); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_add_epi32() { + let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); + let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx2::_mm256_add_epi32(a, b); + let e = 
i32x8::new(0, 2, 4, 6, 8, 10, 12, 14); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_add_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx2::_mm256_add_epi16(a, b); + let e = i16x16::new( + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_add_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let r = avx2::_mm256_add_epi8(a, b); + let e = i8x32::new( + 0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60, 62); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = avx2::_mm256_adds_epi8(a, b); + let e = i8x32::new( + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi8_saturate_positive() { + let a = i8x32::splat(0x7F); + let b = i8x32::splat(1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi8_saturate_negative() { + let a = i8x32::splat(-0x80); + let b = i8x32::splat(-1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); + let r = avx2::_mm256_adds_epi16(a, b); + let e = i16x16::new( + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_positive() { + let a = i16x16::splat(0x7FFF); + let b = i16x16::splat(1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_negative() { + let a = i16x16::splat(-0x8000); + let b = i16x16::splat(-1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8() { + let a = u8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = u8x32::new( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = avx2::_mm256_adds_epu8(a, b); + let e = u8x32::new( + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8_saturate() { + let a = u8x32::splat(0xFF); + let 
b = u8x32::splat(1); + let r = avx2::_mm256_adds_epu8(a, b); + assert_eq!(r, a); + } + + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16() { + let a = u16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = u16x16::new( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); + let r = avx2::_mm256_adds_epu16(a, b); + let e = u16x16::new( + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16_saturate() { + let a = u16x16::splat(0xFFFF); + let b = u16x16::splat(1); + let r = avx2::_mm256_adds_epu16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_and_si256() { + assert_eq!( + avx2::_mm256_and_si256( + __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_andnot_si256() { + assert_eq!( + avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), + __m256i::splat(2)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu8() { + let (a, b) = (u8x32::splat(3), u8x32::splat(9)); + let r = avx2::_mm256_avg_epu8(a, b); + assert_eq!(r, u8x32::splat(6)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu16() { + let (a, b) = (u16x16::splat(3), u16x16::splat(9)); + let r = avx2::_mm256_avg_epu16(a, b); + assert_eq!(r, u16x16::splat(6)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_blendv_epi8() { + let (a,b) = (i8x32::splat(4),i8x32::splat(2)); + let mask = i8x32::splat(0).replace(2,-1); + let e = i8x32::splat(4).replace(2,2); + let r= avx2::_mm256_blendv_epi8(a,b,mask); + assert_eq!(r,e); + } + + #[test] + fn _mm256_cmpeq_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new( + 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpeq_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpeq_epi32() { + let a = i32x8::new(0, 1, 2, 3,4,5,6,7); + let b = i32x8::new(7,6,2,4,3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); + } + + #[test] + fn _mm256_cmpeq_epi64() { + let a = i64x4::new(0, 1, 2, 3); + let b = i64x4::new(3, 2, 2, 0); + let r = avx2::_mm256_cmpeq_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace( + 2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + fn _mm256_cmpgt_epi8() { + let a = i8x32::splat(0).replace(0, 5); + let b = i8x32::splat(0); + let r = avx2::_mm256_cmpgt_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpgt_epi16() { + let a = i16x16::splat(0).replace(0, 5); + let b = i16x16::splat(0); + let r = avx2::_mm256_cmpgt_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpgt_epi32() { + let a = i32x8::splat(0).replace(0, 5); + let b = i32x8::splat(0); + let r = avx2::_mm256_cmpgt_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as 
i32)); + } + + #[test] + fn _mm256_cmpgt_epi64() { + let a = i64x4::splat(0).replace(0, 5); + let b = i64x4::splat(0); + let r = avx2::_mm256_cmpgt_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace( + 0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadd_epi16(a, b); + let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hadd_epi32(a, b); + let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadds_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadds_epi16(a, b); + let e = i16x16::new( + 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature ="+avx2"] + fn _mm256_hsub_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hsub_epi16(a, b); + let e = i16x16::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsub_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hsub_epi32(a, b); + let e = i32x8::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsubs_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); + let b = i16x16::splat(4); + let r = avx2::_mm256_hsubs_epi16(a, b); + let e = i16x16::splat(0).replace(0,0x7FFF); + assert_eq!(r,e); + } + + +} diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 610bf657d0be5..d36fa4444d56a 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -2,11 +2,15 @@ pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; +pub use self::avx2::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; +#[allow(non_camel_case_types)] +pub type __m256i = ::v256::i8x32; mod sse; mod sse2; mod ssse3; mod sse42; +mod avx2; \ No newline at end of file
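As a quick way to exercise the new intrinsics, the sketch below mirrors the test module this patch adds in `src/x86/avx2.rs`; the `use` paths are the crate-internal ones used by those tests, and `check_avx2_lanes` is a hypothetical helper, not part of the change.

```rust
// Crate-internal sketch, assuming the module layout in this patch and an
// AVX2-capable CPU; `check_avx2_lanes` is a hypothetical caller.
use v256::*;
use x86::avx2;

#[target_feature = "+avx2"]
fn check_avx2_lanes() {
    // Saturating add: 255 + 1 stays at 255 in every unsigned 8-bit lane.
    let r = avx2::_mm256_adds_epu8(u8x32::splat(255), u8x32::splat(1));
    assert_eq!(r, u8x32::splat(255));

    // Lane-wise equality yields all-ones (-1) in matching lanes, 0 elsewhere.
    let eq = avx2::_mm256_cmpeq_epi32(i32x8::splat(7), i32x8::splat(7));
    assert_eq!(eq, i32x8::splat(-1));
}
```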