From 5b43883603b5308439dc169f5ab136415b6fd1a2 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Mon, 5 Aug 2024 18:57:42 +0800 Subject: [PATCH] [X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (#101783) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 --- clang/include/clang/Basic/BuiltinsX86.def | 69 +- clang/lib/Headers/avx10_2_512niintrin.h | 279 ++++ clang/lib/Headers/avx10_2niintrin.h | 369 +++++ clang/lib/Headers/avxvnniint16intrin.h | 113 +- clang/lib/Headers/avxvnniint8intrin.h | 113 +- .../test/CodeGen/X86/avx10_2_512ni-builtins.c | 276 ++++ clang/test/CodeGen/X86/avx10_2ni-builtins.c | 381 +++++ .../test/CodeGen/X86/avxvnniint16-builtins.c | 2 + clang/test/CodeGen/X86/avxvnniint8-builtins.c | 2 + llvm/include/llvm/IR/IntrinsicsX86.td | 79 + llvm/lib/Target/X86/X86ISelLowering.cpp | 7 + llvm/lib/Target/X86/X86ISelLowering.h | 10 +- llvm/lib/Target/X86/X86InstrAVX10.td | 34 + llvm/lib/Target/X86/X86InstrAVX512.td | 59 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 12 + llvm/lib/Target/X86/X86InstrInfo.cpp | 54 + llvm/lib/Target/X86/X86InstrSSE.td | 79 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 33 + .../CodeGen/X86/avx10_2_512ni-intrinsics.ll | 387 ++++- llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll | 563 +++++++ .../CodeGen/X86/avxvnniint16-intrinsics.ll | 62 + .../CodeGen/X86/avxvnniint8-intrinsics.ll | 206 +++ .../test/MC/Disassembler/X86/avx10_2ni-32.txt | 1410 +++++++++++++++++ .../test/MC/Disassembler/X86/avx10_2ni-64.txt | 1410 +++++++++++++++++ llvm/test/MC/X86/avx10_2ni-32-intel.s | 1410 +++++++++++++++++ llvm/test/MC/X86/avx10_2ni-64-att.s | 1410 +++++++++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 234 +++ .../utils/TableGen/X86InstrMappingEmitter.cpp | 3 +- 28 files changed, 8815 insertions(+), 251 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index c49b5c36da4fc36..55551f688c14bd4 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ 
b/clang/include/clang/Basic/BuiltinsX86.def @@ -773,18 +773,18 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512v TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512") // AVX-VNNI-INT8 -TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") -TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", 
"avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256") TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl") @@ -1959,6 +1959,27 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +// AVX10.2 VNNI FP16 +TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512") + +// AVX10.2 VNNI INT8 +TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512") + +// AVX10.2 VNNI INT16 +TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, 
"V16iV16iV16iV16i", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512") + // AVX10.2 VMPSADBW TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512") @@ -2088,18 +2109,18 @@ TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256") // AVX-VNNI-INT16 -TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") -TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") 
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256") // AVX-NE-CONVERT TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert") diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index 5ad6993b454338c..7e614f7740bffc5 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -16,6 +16,35 @@ #ifndef __AVX10_2_512NIINTRIN_H #define __AVX10_2_512NIINTRIN_H +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +/* VNNI FP16 */ +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A, + (__v32hf)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W, + __mmask16 __U, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), 
(__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U, + __m512 __W, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), + (__v16sf)_mm512_setzero_ps()); +} + /* VMPSADBW */ #define _mm512_mpsadbw_epu8(A, B, imm) \ ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \ @@ -31,5 +60,255 @@ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \ (__v32hi)_mm512_setzero_si512())) +/* VNNI INT8 */ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + 
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32( + __mmask16 __U, 
__m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +/* VNNI INT16 */ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, 
__m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS 
_mm512_dpwuud_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS + #endif /* __SSE2__ */ #endif /* __AVX10_2_512NIINTRIN_H */ diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h index 42b24d2b5b18fba..c91a7b57c752760 100644 --- a/clang/lib/Headers/avx10_2niintrin.h +++ b/clang/lib/Headers/avx10_2niintrin.h @@ -15,6 +15,58 @@ #ifndef __AVX10_2NIINTRIN_H #define __AVX10_2NIINTRIN_H +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) +#define 
__DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) + +/* VNNI FP16 */ +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A, + (__v8hf)__B); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U, + __m128 __W, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpph_ps(__W, __A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W, + __m256h __A, + __m256h __B) { + return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A, + (__v16hf)__B); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) { + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), + (__v8sf)_mm256_setzero_ps()); +} + /* VMPSADBW */ #define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \ ((__m128i)__builtin_ia32_selectw_128( \ @@ -36,6 +88,320 @@ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \ (__v16hi)_mm256_setzero_si256())) +/* VNNI INT8 */ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, 
(__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, 
__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +/* VNNI INT16 */ +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 +_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + 
(__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
_mm256_maskz_dpwusds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + 
(__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + /* YMM Rounding */ #define _mm256_add_round_pd(A, B, R) \ ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \ @@ -1702,5 +2068,8 @@ (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \ (__v8sf)_mm256_setzero_ps())) +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128 + #endif /* __AVX10_2NIINTRIN_H */ #endif /* __SSE2__ */ diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index e4d342a8b45b1de..805d249911c176c 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -15,14 +15,6 @@ #ifndef __AVXVNNIINT16INTRIN_H #define __AVXVNNIINT16INTRIN_H -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ - __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ - __min_vector_width__(256))) - /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. 
Sum these 2 results with the corresponding @@ -53,12 +45,9 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwsud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwsud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwsuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 
/// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwsuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwusd_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwusd_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -275,12 +252,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, - __m128i __A, 
- __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwusds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwusds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwuud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, 
(__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwuud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwuuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 +#define _mm256_dpwuuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h index b0b6cb853f713f4..c211620c68f0760 100644 --- a/clang/lib/Headers/avxvnniint8intrin.h +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -14,14 +14,6 @@ #ifndef __AVXVNNIINT8INTRIN_H #define __AVXVNNIINT8INTRIN_H -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ - __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ - __min_vector_width__(128))) - /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -52,12 +44,9 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbssd_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbssd_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return 
(__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbssds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbssds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbsud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define 
_mm256_dpbsud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbsuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbsuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbuud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \ + 
(__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbuud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -422,14 +389,10 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbuuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding /// 32-bit integer in \a __W with signed saturation, and store the packed @@ -460,12 +423,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 +#define _mm256_dpbuuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) #endif // __AVXVNNIINT8INTRIN_H diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c index b7982e6ecca84d2..26e0d124c828476 100644 --- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c @@ -3,6 +3,28 @@ #include +// VNNI FP16 +__m512 test_mm512_dpph_ps(__m512 __W, __m512h __A, __m512h __B) { +// CHECK-LABEL: @test_mm512_dpph_ps( +// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512 + return _mm512_dpph_ps(__W, __A, __B); +} + +__m512 test_mm512_mask_dpph_ps(__m512 __W, __mmask16 __U, __m512h __A, __m512h __B) { +// CHECK-LABEL: @test_mm512_mask_dpph_ps( +// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_dpph_ps(__W, __U, __A, __B); +} + +__m512 test_mm512_maskz_dpph_ps(__mmask16 __U, __m512 __W, __m512h __A, __m512h __B) { +// CHECK-LABEL: @test_mm512_maskz_dpph_ps( +// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_maskz_dpph_ps(__U, __W, __A, __B); +} + // VMPSADBW __m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mpsadbw_epu8 @@ 
-23,3 +45,257 @@ __m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17); } + +// VNNI INT8 +__m512i test_mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbssd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512 + return _mm512_dpbssd_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbssd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbssd_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbssd_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbssd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbssd_epi32(__U, __W, __A, __B); +} + +__m512i test_mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbssds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512 + return _mm512_dpbssds_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbssds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbssds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbssds_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbssds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbssds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, 
<16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbssds_epi32(__U, __W, __A, __B); +} + +__m512i test_mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbsud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512 + return _mm512_dpbsud_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbsud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbsud_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbsud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbsud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbsud_epi32(__U, __W, __A, __B); +} + +__m512i test_mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512 + return _mm512_dpbsuds_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbsuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbsuds_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbsuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbsuds_epi32(__U, __W, __A, __B); +} + +__m512i test_mm512_dpbuud_epi32(__m512i 
__W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512 + return _mm512_dpbuud_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbuud_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbuud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbuud_epi32(__U, __W, __A, __B); +} + +__m512i test_mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_dpbuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512 + return _mm512_dpbuuds_epi32(__W, __A, __B); +} + +__m512i test_mm512_mask_dpbuuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_mask_dpbuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512 +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpbuuds_epi32(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { +// CHECK-LABEL: @test_mm512_maskz_dpbuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512 +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpbuuds_epi32(__U, __W, __A, __B); +} + +/* VNNI INT16 */ +__m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwsud_epi32( +// CHECK: call <16 x i32> 
@llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwsud_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwsud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpwsud_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwsuds_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
%{{.*}} + return _mm512_maskz_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwusd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwusd_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwusd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpwusd_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwusds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwusds_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwusds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: 
@test_mm512_maskz_dpwusds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpwusds_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwuud_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpwuud_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { +// CHECK-LABEL: @test_mm512_dpwuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return _mm512_dpwuuds_epi32(__A, __B, __C); +} + +__m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: select <16 x i1> 
%{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D); +} + +__m512i test_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +// CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32( +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: zeroinitializer +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + return _mm512_maskz_dpwuuds_epi32(__A, __B, __C, __D); +} diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c index ace3b7e30c7f65b..d06a008c09e71de 100644 --- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c @@ -3,6 +3,49 @@ #include +// VNNI FP16 +__m128 test_mm_dpph_ps(__m128 __W, __m128h __A, __m128h __B) { +// CHECK-LABEL: @test_mm_dpph_ps( +// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128 + return _mm_dpph_ps(__W, __A, __B); +} + +__m128 test_mm_mask_dpph_ps(__m128 __W, __mmask8 __U, __m128h __A, __m128h __B) { +// CHECK-LABEL: @test_mm_mask_dpph_ps( +// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_dpph_ps(__W, __U, __A, __B); +} + +__m128 test_mm_maskz_dpph_ps(__mmask8 __U, __m128 __W, __m128h __A, __m128h __B) { +// CHECK-LABEL: @test_mm_maskz_dpph_ps( +// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128 +// CHECK: zeroinitializer +// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_maskz_dpph_ps(__U, __W, __A, __B); +} + +__m256 test_mm256_dpph_ps(__m256 __W, __m256h __A, __m256h __B) { +// CHECK-LABEL: @test_mm256_dpph_ps( +// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256 + return _mm256_dpph_ps(__W, __A, __B); +} + +__m256 test_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) { +// CHECK-LABEL: 
@test_mm256_mask_dpph_ps( +// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + return _mm256_mask_dpph_ps(__W, __U, __A, __B); +} + +__m256 test_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) { +// CHECK-LABEL: @test_mm256_maskz_dpph_ps( +// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256 +// CHECK: zeroinitializer +// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + return _mm256_maskz_dpph_ps(__U, __W, __A, __B); +} + // VMPSADBW __m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mpsadbw_epu8 @@ -44,6 +87,344 @@ __m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) { return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170); } +// VNNI INT8 +__m128i test_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_mask_dpbssd_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssd.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbssd_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbssd_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssd.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpbssd_epi32(__U, __W, __A, __B); +} + +__m128i test_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_mask_dpbssds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbssds_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbssds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
%{{.*}} + return _mm_maskz_dpbssds_epi32(__U, __W, __A, __B); +} + +__m128i test_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_mask_dpbsud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsud.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbsud_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbsud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsud.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpbsud_epi32(__U, __W, __A, __B); +} + +__m128i test_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_mask_dpbsuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsuds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbsuds_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbsuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsuds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpbsuds_epi32(__U, __W, __A, __B); +} + +__m128i test_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_mask_dpbuud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuud.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbuud_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbuud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuud.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpbuud_epi32(__U, __W, __A, __B); +} + +__m128i test_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +// 
CHECK-LABEL: @test_mm_mask_dpbuuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuuds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpbuuds_epi32(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { +// CHECK-LABEL: @test_mm_maskz_dpbuuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuuds.128 +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpbuuds_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbssd_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssd.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbssd_epi32(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbssd_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssd.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbssd_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbssds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbssds_epi32(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_dpbssds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbssds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbssds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbssds_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbsud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsud.256 +// CHECK: select <8 x i1> %{{.*}}, 
<8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbsud_epi32(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbsud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsud.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbsud_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbsuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsuds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbsuds_epi32(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_dpbsuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbsuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbsuds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbsuds_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbuud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuud.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbuud_epi32(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbuud_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuud.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbuud_epi32(__U, __W, __A, __B); +} + +__m256i test_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_mask_dpbuuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuuds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpbuuds_epi32(__W, __U, __A, __B); +} + +__m256i 
test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { +// CHECK-LABEL: @test_mm256_maskz_dpbuuds_epi32 +// CHECK: @llvm.x86.avx2.vpdpbuuds.256 +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpbuuds_epi32(__U, __W, __A, __B); +} + +// VNNI INT16 +__m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwsud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwsud_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwsud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwsud_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwsud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwsud_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwsuds_epi32( +// 
CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwsuds_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwusd_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwusd_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwusd_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwusd_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwusd_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwusd_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwusds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwusds_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwusds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwusds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwusds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> 
%{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwusds_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwuud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwuud_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwuud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwuud_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwuud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwuud_epi32(__A, __B, __C, 
__D); +} + +__m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_mask_dpwuuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D); +} + +__m128i test_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +// CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + return _mm_maskz_dpwuuds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D); +} + +__m256i test_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +// CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + return _mm256_maskz_dpwuuds_epi32(__A, __B, __C, __D); +} + // YMM Rounding __m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_add_round_pd diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c index a10ca551a151468..f9feaea1e244d0e 100644 --- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 %s -ffreestanding 
-triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s #include diff --git a/clang/test/CodeGen/X86/avxvnniint8-builtins.c b/clang/test/CodeGen/X86/avxvnniint8-builtins.c index cbdf443888a15ac..80d005c16d3877a 100644 --- a/clang/test/CodeGen/X86/avxvnniint8-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint8-builtins.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s #include diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 7160c8dfa7600d2..eb2cb3fbfce8e81 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -4980,6 +4980,85 @@ let TargetPrefix = "x86" in { //===----------------------------------------------------------------------===// // AVX10.2 intrinsics let TargetPrefix = "x86" in { + // VNNI FP16 + def int_x86_avx10_vdpphps_128 : + ClangBuiltin<"__builtin_ia32_vdpphps128">, + DefaultAttrsIntrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8f16_ty, llvm_v8f16_ty], + [IntrNoMem]>; + def int_x86_avx10_vdpphps_256 : + 
ClangBuiltin<"__builtin_ia32_vdpphps256">, + DefaultAttrsIntrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v16f16_ty, llvm_v16f16_ty], + [IntrNoMem]>; + def int_x86_avx10_vdpphps_512 : + ClangBuiltin<"__builtin_ia32_vdpphps512">, + DefaultAttrsIntrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v32f16_ty, llvm_v32f16_ty], + [IntrNoMem]>; + // VNNI INT8 + def int_x86_avx10_vpdpbssd_512 : + ClangBuiltin<"__builtin_ia32_vpdpbssd512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpbssds_512 : + ClangBuiltin<"__builtin_ia32_vpdpbssds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpbsud_512 : + ClangBuiltin<"__builtin_ia32_vpdpbsud512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpbsuds_512 : + ClangBuiltin<"__builtin_ia32_vpdpbsuds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpbuud_512 : + ClangBuiltin<"__builtin_ia32_vpdpbuud512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpbuuds_512 : + ClangBuiltin<"__builtin_ia32_vpdpbuuds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + // VNNI INT16 + def int_x86_avx10_vpdpwsud_512 : + ClangBuiltin<"__builtin_ia32_vpdpwsud512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpwsuds_512 : + ClangBuiltin<"__builtin_ia32_vpdpwsuds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpwusd_512 : + ClangBuiltin<"__builtin_ia32_vpdpwusd512">, + 
DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpwusds_512 : + ClangBuiltin<"__builtin_ia32_vpdpwusds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpwuud_512 : + ClangBuiltin<"__builtin_ia32_vpdpwuud512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + def int_x86_avx10_vpdpwuuds_512 : + ClangBuiltin<"__builtin_ia32_vpdpwuuds512">, + DefaultAttrsIntrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [IntrNoMem]>; + // VMPSADBW def int_x86_avx10_vmpsadbw_512 : ClangBuiltin<"__builtin_ia32_mpsadbw512">, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fe1865409a26545..fff65a1bd967c54 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34033,6 +34033,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVTNEPS2BF16) NODE_NAME_CASE(MCVTNEPS2BF16) NODE_NAME_CASE(DPBF16PS) + NODE_NAME_CASE(DPFP16PS) NODE_NAME_CASE(MPSADBW) NODE_NAME_CASE(LWPINS) NODE_NAME_CASE(MGATHER) @@ -34058,6 +34059,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VPDPBUUDS) NODE_NAME_CASE(VPDPBSSD) NODE_NAME_CASE(VPDPBSSDS) + NODE_NAME_CASE(VPDPWSUD) + NODE_NAME_CASE(VPDPWSUDS) + NODE_NAME_CASE(VPDPWUSD) + NODE_NAME_CASE(VPDPWUSDS) + NODE_NAME_CASE(VPDPWUUD) + NODE_NAME_CASE(VPDPWUUDS) NODE_NAME_CASE(VMINMAX) NODE_NAME_CASE(VMINMAX_SAE) NODE_NAME_CASE(VMINMAXS) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 7642a528fb22ed1..b985f7529ea2abd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -595,6 +595,13 @@ namespace llvm { VPDPBSSD, VPDPBSSDS, + 
VPDPWSUD, + VPDPWSUDS, + VPDPWUSD, + VPDPWUSDS, + VPDPWUUD, + VPDPWUUDS, + VMINMAX, VMINMAX_SAE, VMINMAXS, @@ -661,9 +668,10 @@ namespace llvm { // SRC, PASSTHRU, MASK MCVTNEPS2BF16, - // Dot product of BF16 pairs to accumulated into + // Dot product of BF16/FP16 pairs accumulated into // packed single precision. DPBF16PS, + DPFP16PS, // A stack checking function call. On Windows it's _chkstk call. DYN_ALLOCA, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 920317ded15c672..8e4586f2002d976 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -12,6 +12,40 @@ // //===----------------------------------------------------------------------===// +// VNNI FP16 +let ExeDomain = SSEPackedSingle in +defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info, + [HasAVX10_2], [HasAVX10_2_512]>, + T8, PS, EVEX_CD8<32, CD8VF>; + +// VNNI INT8 +defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, XD; +defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, XD; +defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, XS; +defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, XS; +defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, PS; +defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, PS; + +// VNNI INT16 +defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, XS; +defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, XS; +defm VPDPWUSD : VNNI_common<0xd2,
"vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, PD; +defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0, + [HasAVX10_2], [HasAVX10_2_512]>, PD; +defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, PS; +defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1, + [HasAVX10_2], [HasAVX10_2_512]>, PS; + // VMPSADBW defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index f9b8cb689694e7b..e616a8a37c64872 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12390,13 +12390,13 @@ multiclass VNNI_rmb Op, string OpStr, SDNode OpNode, (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)), IsCommutable, IsCommutable>, - EVEX, VVVV, T8, PD, Sched<[sched]>; + EVEX, VVVV, T8, Sched<[sched]>; defm m : AVX512_maskable_3src, - EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, PD, + EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb : AVX512_maskable_3src Op, string OpStr, SDNode OpNode, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B, - T8, PD, Sched<[sched.Folded, sched.ReadAfterFold, + T8, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } multiclass VNNI_common Op, string OpStr, SDNode OpNode, - X86SchedWriteWidths sched, bit IsCommutable> { - let Predicates = [HasVNNI] in + X86SchedWriteWidths sched, bit IsCommutable, + list prds, list prds512> { + let Predicates = prds512 in defm Z : VNNI_rmb, EVEX_V512; - let Predicates = [HasVNNI, HasVLX] in { + let Predicates = prds in { defm Z256 : VNNI_rmb, EVEX_V256; defm Z128 : VNNI_rmb Op, string OpStr, SDNode OpNode, } // FIXME: Is 
there a better scheduler class for VPDP? -defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>; -defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>; -defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>; -defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>; +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0, + [HasVNNI, HasVLX], [HasVNNI]>, PD; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0, + [HasVNNI, HasVLX], [HasVNNI]>, PD; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1, + [HasVNNI, HasVLX], [HasVNNI]>, PD; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1, + [HasVNNI, HasVLX], [HasVNNI]>, PD; // Patterns to match VPDPWSSD from existing instructions/intrinsics. let Predicates = [HasVNNI] in { @@ -12806,9 +12811,9 @@ let Predicates = [HasBF16] in { } let Constraints = "$src1 = $dst" in { -multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, - X86VectorVTInfo _, X86VectorVTInfo src_v> { +multiclass avx512_dpf16ps_rm opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo src_v> { defm r: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, } } // Constraints = "$src1 = $dst" -multiclass avx512_dpbf16ps_sizes opc, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, - AVX512VLVectorVTInfo src_v, Predicate prd> { - let Predicates = [prd] in { - defm Z : avx512_dpbf16ps_rm, EVEX_V512; +multiclass avx512_dpf16ps_sizes opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _, list prds, + list prds512> { + let Predicates = prds512 in { + defm Z : avx512_dpf16ps_rm, EVEX_V512; } - let Predicates = [HasVLX, prd] in { - defm Z256 : avx512_dpbf16ps_rm, EVEX_V256; - defm Z128 : 
avx512_dpbf16ps_rm, EVEX_V128; + let Predicates = prds in { + defm Z256 : avx512_dpf16ps_rm, EVEX_V256; + defm Z128 : avx512_dpf16ps_rm, EVEX_V128; } } let ExeDomain = SSEPackedSingle in -defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, - avx512vl_f32_info, avx512vl_bf16_info, - HasBF16>, T8, XS, EVEX_CD8<32, CD8VF>; +defm VDPBF16PS : avx512_dpf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, avx512vl_bf16_info, + [HasVLX, HasBF16], [HasBF16]>, + T8, XS, EVEX_CD8<32, CD8VF>; //===----------------------------------------------------------------------===// // AVX512FP16 diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 11b75240b2504a4..78c76cacbfef3fc 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -802,6 +802,11 @@ def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS", SDTCisSameAs<0,1>, SDTCVecEltisVT<2, bf16>, SDTCisSameAs<2,3>]>>; +def X86dpfp16ps : SDNode<"X86ISD::DPFP16PS", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0,1>, + SDTCVecEltisVT<2, f16>, + SDTCisSameAs<2,3>]>>; // galois field arithmetic def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; @@ -819,6 +824,13 @@ def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>; def X86vpdpbuud : SDNode<"X86ISD::VPDPBUUD", SDTVnni>; def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>; +def X86vpdpwsud : SDNode<"X86ISD::VPDPWSUD", SDTVnni>; +def X86vpdpwsuds : SDNode<"X86ISD::VPDPWSUDS", SDTVnni>; +def X86vpdpwusd : SDNode<"X86ISD::VPDPWUSD", SDTVnni>; +def X86vpdpwusds : SDNode<"X86ISD::VPDPWUSDS", SDTVnni>; +def X86vpdpwuud : SDNode<"X86ISD::VPDPWUUD", SDTVnni>; +def X86vpdpwuuds : SDNode<"X86ISD::VPDPWUUDS", SDTVnni>; + def X86Vmpsadbw : SDNode<"X86ISD::MPSADBW", SDTX86PSADBW>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp index 7fc786b1e570ba1..7e2e97d387a83fb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2953,6 +2953,42 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VPDPBUUDSYrr: case X86::VPDPBUUDrr: case X86::VPDPBUUDYrr: + case X86::VPDPBSSDSZ128r: + case X86::VPDPBSSDSZ128rk: + case X86::VPDPBSSDSZ128rkz: + case X86::VPDPBSSDSZ256r: + case X86::VPDPBSSDSZ256rk: + case X86::VPDPBSSDSZ256rkz: + case X86::VPDPBSSDSZr: + case X86::VPDPBSSDSZrk: + case X86::VPDPBSSDSZrkz: + case X86::VPDPBSSDZ128r: + case X86::VPDPBSSDZ128rk: + case X86::VPDPBSSDZ128rkz: + case X86::VPDPBSSDZ256r: + case X86::VPDPBSSDZ256rk: + case X86::VPDPBSSDZ256rkz: + case X86::VPDPBSSDZr: + case X86::VPDPBSSDZrk: + case X86::VPDPBSSDZrkz: + case X86::VPDPBUUDSZ128r: + case X86::VPDPBUUDSZ128rk: + case X86::VPDPBUUDSZ128rkz: + case X86::VPDPBUUDSZ256r: + case X86::VPDPBUUDSZ256rk: + case X86::VPDPBUUDSZ256rkz: + case X86::VPDPBUUDSZr: + case X86::VPDPBUUDSZrk: + case X86::VPDPBUUDSZrkz: + case X86::VPDPBUUDZ128r: + case X86::VPDPBUUDZ128rk: + case X86::VPDPBUUDZ128rkz: + case X86::VPDPBUUDZ256r: + case X86::VPDPBUUDZ256rk: + case X86::VPDPBUUDZ256rkz: + case X86::VPDPBUUDZr: + case X86::VPDPBUUDZrk: + case X86::VPDPBUUDZrkz: case X86::VPDPWSSDZ128r: case X86::VPDPWSSDZ128rk: case X86::VPDPWSSDZ128rkz: @@ -2971,6 +3007,24 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VPDPWSSDSZr: case X86::VPDPWSSDSZrk: case X86::VPDPWSSDSZrkz: + case X86::VPDPWUUDZ128r: + case X86::VPDPWUUDZ128rk: + case X86::VPDPWUUDZ128rkz: + case X86::VPDPWUUDZ256r: + case X86::VPDPWUUDZ256rk: + case X86::VPDPWUUDZ256rkz: + case X86::VPDPWUUDZr: + case X86::VPDPWUUDZrk: + case X86::VPDPWUUDZrkz: + case X86::VPDPWUUDSZ128r: + case X86::VPDPWUUDSZ128rk: + case X86::VPDPWUUDSZ128rkz: + case X86::VPDPWUUDSZ256r: + case X86::VPDPWUUDSZ256rk: + case X86::VPDPWUUDSZ256rkz: + case X86::VPDPWUUDSZr: + 
case X86::VPDPWUUDSZrk: + case X86::VPDPWUUDSZrkz: case X86::VPMADD52HUQrr: case X86::VPMADD52HUQYrr: case X86::VPMADD52HUQZ128r: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 2fc3b6aa9885805..5f9211edfa161be 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8425,46 +8425,41 @@ defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS, defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV; defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV; -let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in -multiclass avx_vnni_int16 opc, string OpcodeStr, bit IsCommutable> { - let isCommutable = IsCommutable in - def rr : I("int_x86_avx2_"#OpcodeStr#"_128") - VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; - - def rm : I("int_x86_avx2_"#OpcodeStr#"_128") - VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>, - VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; - - let isCommutable = IsCommutable in - def Yrr : I("int_x86_avx2_"#OpcodeStr#"_256") - VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; - - def Yrm : I("int_x86_avx2_"#OpcodeStr#"_256") - VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>, - VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; +let Predicates = [HasAVXVNNIINT16] in { + defm VPDPWSUD : avx_dotprod_rm<0xd2,"vpdpwsud", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwsud, SchedWriteVecIMul.XMM, + 0>, T8, XS; + defm VPDPWSUDY : avx_dotprod_rm<0xd2,"vpdpwsud", v8i32, VR256, loadv8i32, + i256mem, X86vpdpwsud, SchedWriteVecIMul.YMM, + 0>, VEX_L, T8, XS; + defm VPDPWSUDS : avx_dotprod_rm<0xd3,"vpdpwsuds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwsuds, SchedWriteVecIMul.XMM, + 0>, T8, XS; + defm VPDPWSUDSY : avx_dotprod_rm<0xd3,"vpdpwsuds", v8i32, VR256, loadv8i32, + i256mem, 
X86vpdpwsuds, SchedWriteVecIMul.YMM, + 0>, VEX_L, T8, XS; + defm VPDPWUSD : avx_dotprod_rm<0xd2,"vpdpwusd", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwusd, SchedWriteVecIMul.XMM, + 0>, T8, PD; + defm VPDPWUSDY : avx_dotprod_rm<0xd2,"vpdpwusd", v8i32, VR256, loadv8i32, + i256mem, X86vpdpwusd, SchedWriteVecIMul.YMM, + 0>, VEX_L, T8, PD; + defm VPDPWUSDS : avx_dotprod_rm<0xd3,"vpdpwusds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwusds, SchedWriteVecIMul.XMM, + 0>, T8, PD; + defm VPDPWUSDSY : avx_dotprod_rm<0xd3,"vpdpwusds", v8i32, VR256, loadv8i32, + i256mem, X86vpdpwusds, SchedWriteVecIMul.YMM, + 0>, VEX_L, T8, PD; + defm VPDPWUUD : avx_dotprod_rm<0xd2,"vpdpwuud", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwuud, SchedWriteVecIMul.XMM, + 1>, T8; + defm VPDPWUUDY : avx_dotprod_rm<0xd2,"vpdpwuud", v8i32, VR256, loadv8i32, + i256mem, X86vpdpwuud, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8; + defm VPDPWUUDS : avx_dotprod_rm<0xd3,"vpdpwuuds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpwuuds, SchedWriteVecIMul.XMM, + 1>, T8; + defm VPDPWUUDSY : avx_dotprod_rm<0xd3,"vpdpwuuds", v8i32, VR256, loadv8i32, + i256mem, X86vpdpwuuds, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8; } - -defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS; -defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS; -defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD; -defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD; -defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8; -defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index a7473e495330bea..536391da295ddef 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -552,6 +552,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx10_vdivps256, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx10_vdpphps_128, 
INTR_TYPE_3OP, X86ISD::DPFP16PS, 0), + X86_INTRINSIC_DATA(avx10_vdpphps_256, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0), + X86_INTRINSIC_DATA(avx10_vdpphps_512, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0), X86_INTRINSIC_DATA(avx10_vfmaddpd256, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND), X86_INTRINSIC_DATA(avx10_vfmaddph256, INTR_TYPE_3OP, ISD::FMA, @@ -590,6 +593,24 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx10_vmulps256, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx10_vpdpbssd_512, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0), + X86_INTRINSIC_DATA(avx10_vpdpbssds_512, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, + 0), + X86_INTRINSIC_DATA(avx10_vpdpbsud_512, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0), + X86_INTRINSIC_DATA(avx10_vpdpbsuds_512, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, + 0), + X86_INTRINSIC_DATA(avx10_vpdpbuud_512, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), + X86_INTRINSIC_DATA(avx10_vpdpbuuds_512, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, + 0), + X86_INTRINSIC_DATA(avx10_vpdpwsud_512, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0), + X86_INTRINSIC_DATA(avx10_vpdpwsuds_512, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, + 0), + X86_INTRINSIC_DATA(avx10_vpdpwusd_512, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0), + X86_INTRINSIC_DATA(avx10_vpdpwusds_512, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, + 0), + X86_INTRINSIC_DATA(avx10_vpdpwuud_512, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0), + X86_INTRINSIC_DATA(avx10_vpdpwuuds_512, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, + 0), X86_INTRINSIC_DATA(avx10_vsqrtpd256, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx10_vsqrtph256, INTR_TYPE_1OP, ISD::FSQRT, @@ -662,6 +683,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwsud_128, INTR_TYPE_3OP, 
X86ISD::VPDPWSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwsud_256, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwsuds_128, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwsuds_256, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwusd_128, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwusd_256, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwusds_128, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwusds_256, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwuud_128, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwuud_256, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpwuuds_128, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpwuuds_256, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index bafa52a2a83ae7d..07e86cb01e133ca 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -1,6 +1,389 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 + +; VNNI FP16 + +define <16 x float> @test_mm512_dpph_ps(<16 x float> %__W, <32 x half> %__A, <32 x 
half> %__B) { +; CHECK-LABEL: test_mm512_dpph_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vdpphps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U, <32 x half> %__A, <32 x half> %__B) { +; X86-LABEL: test_mm512_mask_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> %__W + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__W, <32 x half> %__A, <32 x half> %__B) { +; X86-LABEL: test_mm512_maskz_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> 
%__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> zeroinitializer + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>) + +; VNNI INT8 + +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpbssd_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpdpbssd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_dpbssd_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpbssds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpbssds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: 
test_mm512_maskz_dpbssd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpbssd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpbsud_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpdpbsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_dpbsud_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpbsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: 
[0x62,0xf2,0x76,0x49,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpbsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_maskz_dpbsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpbsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpbuud_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpdpbuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x00] +; X86-NEXT: retl # encoding: [0xc3] 
+; +; X64-LABEL: test_mm512_dpbuud_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpbuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpbuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_maskz_dpbuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpbuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x 
i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +; VNNI INT16 + +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpwsud_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpdpwsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_dpwsud_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpwsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpwsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> 
@test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_maskz_dpwsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpwsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpwusd_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpdpwusd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_dpwusd_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpwusd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpwusds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # 
encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpwusds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_maskz_dpwusd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpwusd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +; X86-LABEL: test_mm512_dpwuud_epi32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: 
vpdpwuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_dpwuud_epi32: +; X64: # %bb.0: +; X64-NEXT: vpdpwuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %__B = load <16 x i32>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_mask_dpwuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_mask_dpwuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W + ret <16 x i32> %res +} + +define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +; X86-LABEL: test_mm512_maskz_dpwuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm512_maskz_dpwuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2] +; 
X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %bst = bitcast i16 %__U to <16 x i1> + %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) ; VMPSADBW diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 34d740302d744ec..31cec891c4cf386 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -2,6 +2,569 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 +; VNNI FP16 + +define <4 x float> @test_mm_dpph_ps(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) { +; CHECK-LABEL: test_mm_dpph_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vdpphps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + ret <4 x float> %res +} + +define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x half> %__A, <8 x half> %__B) { +; X86-LABEL: test_mm_mask_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: 
[0x62,0xf2,0x74,0x09,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> %__W + ret <4 x float> %res +} + +define <4 x float> @test_mm_maskz_dpph_ps(i8 zeroext %__U, <4 x float> %__W, <8 x half> %__A, <8 x half> %__B) { +; X86-LABEL: test_mm_maskz_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <8 x float> @test_mm256_dpph_ps(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) { +; CHECK-LABEL: test_mm256_dpph_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vdpphps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + ret <8 x float> %res +} + +define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <16 x half> %__A, <16 x half> %__B) { +; X86-LABEL: test_mm256_mask_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding:
[0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> %__W + ret <8 x float> %res +} + +define <8 x float> @test_mm256_maskz_dpph_ps(i8 zeroext %__U, <8 x float> %__W, <16 x half> %__A, <16 x half> %__B) { +; X86-LABEL: test_mm256_maskz_dpph_ps: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpph_ps: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> zeroinitializer + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x half>) +declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>) + +; VNNI INT8 + +define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpbssd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpbssd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpbssds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpbssds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpbssds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpbssds_epi32: +; X64: # %bb.0: 
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_mask_dpbssd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpbssd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpbsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2] +; 
X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpbsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpbsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpbsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpbsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpbsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf2,0x76,0x29,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_mask_dpbsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpbsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpbuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpbuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # 
encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpbuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpbuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpbuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpbuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> 
@llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_mask_dpbuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpbuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +; VNNI INT16 + +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpwsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpwsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # 
encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpwsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpwsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpwsuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpwsuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x 
i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_mask_dpwsud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpwsud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpwusd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpwusd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> 
@llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpwusds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpwusds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpwusds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpwusds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> 
@test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_mask_dpwusd_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpwusd_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_mask_dpwuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_mask_dpwuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 
x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W + ret <4 x i32> %res +} + +define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +; X86-LABEL: test_mm_maskz_dpwuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm_maskz_dpwuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %bst = bitcast i4 %__U to <4 x i1> + %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: test_mm256_maskz_dpwuuds_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_maskz_dpwuuds_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W + ret <8 x i32> %res +} + +define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +; X86-LABEL: 
test_mm256_mask_dpwuud_epi32: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mm256_mask_dpwuud_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %bst = bitcast i8 %__U to <8 x i1> + %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + ; VMPSADBW define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) { diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll index 999c968fa80db58..8601d454215ad1c 100644 --- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10 +; RUN: llc < %s 
-verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10 define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -17,6 +24,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } @@ -27,6 +39,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -37,6 +54,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: 
vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } @@ -47,6 +69,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -57,6 +84,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } @@ -67,6 +99,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # 
encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -77,6 +114,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } @@ -87,6 +129,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -97,6 +144,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } @@ -107,6 +159,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # 
encoding: [0xc4,0xe2,0x70,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) ret <4 x i32> %ret } @@ -117,6 +174,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) ret <8 x i32> %ret } diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll index f9e44ac4132be51..607720fbc3f3387 100644 --- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64 declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) @@ -22,6 +24,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 
x i32> %x1, pt ; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x18] +; AVX10-X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) @@ -48,6 +67,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa 
%xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x18] +; AVX10-X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) @@ -74,6 +110,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x18] +; AVX10-X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # 
encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) @@ -100,6 +153,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x18] +; AVX10-X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; 
AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) @@ -126,6 +196,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x18] +; AVX10-X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) @@ -152,6 +239,23 @@ define <4 
x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x18] +; AVX10-X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) @@ -178,6 +282,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax 
# encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x18] +; AVX10-X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) @@ -204,6 +325,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x18] +; AVX10-X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) @@ -230,6 +368,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x18] +; AVX10-X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, 
%xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) @@ -256,6 +411,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] ; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x18] +; AVX10-X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <4 x i32>, ptr %x2p %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, 
<4 x i32> %x1, <4 x i32> %x4) @@ -282,6 +454,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x18] +; AVX10-X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x1f] +; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) @@ -308,6 +497,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X86: # 
%bb.0: +; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X86-NEXT: vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x18] +; AVX10-X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; AVX10-X64-NEXT: vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x1f] +; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] +; AVX10-X64-NEXT: retq # encoding: [0xc3] %x2 = load <8 x i32>, ptr %x2p %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt index 8254e37e9aa9e37..912c0799d1316c2 100644 --- a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt +++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt @@ -1,6 +1,1416 @@ # RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# VNNI FP16 + +# ATT: vdpphps %xmm4, %xmm3, %xmm2 +# INTEL: vdpphps xmm2, xmm3, xmm4 +0x62,0xf2,0x64,0x08,0x52,0xd4 + +# ATT: vdpphps %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vdpphps xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0x52,0xd4 + +# ATT: vdpphps %xmm4, 
%xmm3, %xmm2 {%k7} {z} +# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x64,0x8f,0x52,0xd4 + +# ATT: vdpphps %ymm4, %ymm3, %ymm2 +# INTEL: vdpphps ymm2, ymm3, ymm4 +0x62,0xf2,0x64,0x28,0x52,0xd4 + +# ATT: vdpphps %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vdpphps ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0x52,0xd4 + +# ATT: vdpphps %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0x52,0xd4 + +# ATT: vdpphps %zmm4, %zmm3, %zmm2 +# INTEL: vdpphps zmm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0x52,0xd4 + +# ATT: vdpphps %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vdpphps zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0x52,0xd4 + +# ATT: vdpphps %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0x52,0xd4 + +# ATT: vdpphps 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vdpphps xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x64,0x18,0x52,0x10 + +# ATT: vdpphps -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdpphps 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f + +# ATT: vdpphps -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x64,0x9f,0x52,0x52,0x80 + +# ATT: vdpphps 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vdpphps ymm2 
{k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vdpphps ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x64,0x38,0x52,0x10 + +# ATT: vdpphps -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdpphps 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f + +# ATT: vdpphps -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x64,0xbf,0x52,0x52,0x80 + +# ATT: vdpphps 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vdpphps zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x64,0x58,0x52,0x10 + +# ATT: vdpphps -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vdpphps 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f + +# ATT: vdpphps -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x64,0xdf,0x52,0x52,0x80 + +# VNNI INT8 + +# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmm4 +0xc4,0xe2,0x63,0x50,0xd4 + +# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x67,0x0f,0x50,0xd4 + +# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x67,0x8f,0x50,0xd4 + +# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymm4 +0xc4,0xe2,0x67,0x50,0xd4 + +# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x67,0x2f,0x50,0xd4 + +# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x67,0xaf,0x50,0xd4 + +# ATT: vpdpbssd %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbssd zmm2, zmm3, zmm4 +0x62,0xf2,0x67,0x48,0x50,0xd4 + +# ATT: vpdpbssd %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x67,0x4f,0x50,0xd4 + +# ATT: vpdpbssd %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x67,0xcf,0x50,0xd4 + +# ATT: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x67,0x18,0x50,0x10 + +# ATT: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssd 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f + +# ATT: vpdpbssd -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x67,0x9f,0x50,0x52,0x80 + +# ATT: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x67,0x38,0x50,0x10 + +# ATT: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssd 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f + +# ATT: vpdpbssd -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x67,0xbf,0x50,0x52,0x80 + +# ATT: vpdpbssd 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x67,0x58,0x50,0x10 + +# ATT: vpdpbssd -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbssd 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f + +# ATT: vpdpbssd -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x67,0xdf,0x50,0x52,0x80 + +# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmm4 +0xc4,0xe2,0x63,0x51,0xd4 + +# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x67,0x0f,0x51,0xd4 + +# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x67,0x8f,0x51,0xd4 + +# ATT: 
vpdpbssds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymm4 +0xc4,0xe2,0x67,0x51,0xd4 + +# ATT: vpdpbssds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x67,0x2f,0x51,0xd4 + +# ATT: vpdpbssds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x67,0xaf,0x51,0xd4 + +# ATT: vpdpbssds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbssds zmm2, zmm3, zmm4 +0x62,0xf2,0x67,0x48,0x51,0xd4 + +# ATT: vpdpbssds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x67,0x4f,0x51,0xd4 + +# ATT: vpdpbssds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x67,0xcf,0x51,0xd4 + +# ATT: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x67,0x18,0x51,0x10 + +# ATT: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f + +# ATT: vpdpbssds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x67,0x9f,0x51,0x52,0x80 + +# ATT: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x67,0x38,0x51,0x10 + +# ATT: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f + +# ATT: vpdpbssds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x67,0xbf,0x51,0x52,0x80 + +# ATT: vpdpbssds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x67,0x58,0x51,0x10 + +# ATT: vpdpbssds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbssds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f + +# ATT: vpdpbssds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x67,0xdf,0x51,0x52,0x80 + +# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0x50,0xd4 + +# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x66,0x0f,0x50,0xd4 + +# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x66,0x8f,0x50,0xd4 + +# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0x50,0xd4 + +# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x66,0x2f,0x50,0xd4 + +# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x66,0xaf,0x50,0xd4 + +# ATT: vpdpbsud %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbsud zmm2, zmm3, zmm4 +0x62,0xf2,0x66,0x48,0x50,0xd4 + +# ATT: vpdpbsud %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x66,0x4f,0x50,0xd4 + +# ATT: vpdpbsud %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x66,0xcf,0x50,0xd4 + +# ATT: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x66,0x18,0x50,0x10 + +# ATT: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsud 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f + +# ATT: vpdpbsud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x66,0x9f,0x50,0x52,0x80 + +# ATT: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x66,0x38,0x50,0x10 + +# ATT: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsud 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f + +# ATT: vpdpbsud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x66,0xbf,0x50,0x52,0x80 + +# ATT: vpdpbsud 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x66,0x58,0x50,0x10 + +# ATT: vpdpbsud -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbsud 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f + +# ATT: vpdpbsud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x66,0xdf,0x50,0x52,0x80 + +# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0x51,0xd4 + +# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x66,0x0f,0x51,0xd4 + +# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x66,0x8f,0x51,0xd4 + +# ATT: 
vpdpbsuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0x51,0xd4 + +# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x66,0x2f,0x51,0xd4 + +# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x66,0xaf,0x51,0xd4 + +# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbsuds zmm2, zmm3, zmm4 +0x62,0xf2,0x66,0x48,0x51,0xd4 + +# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x66,0x4f,0x51,0xd4 + +# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x66,0xcf,0x51,0xd4 + +# ATT: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x66,0x18,0x51,0x10 + +# ATT: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f + +# ATT: vpdpbsuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x66,0x9f,0x51,0x52,0x80 + +# ATT: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x66,0x38,0x51,0x10 + +# ATT: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f + +# ATT: vpdpbsuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x66,0xbf,0x51,0x52,0x80 + +# ATT: vpdpbsuds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x66,0x58,0x51,0x10 + +# ATT: vpdpbsuds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbsuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f + +# ATT: vpdpbsuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x66,0xdf,0x51,0x52,0x80 + +# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0x50,0xd4 + +# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0x50,0xd4 + +# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x64,0x8f,0x50,0xd4 + +# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0x50,0xd4 + +# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0x50,0xd4 + +# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0x50,0xd4 + +# ATT: vpdpbuud %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbuud zmm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0x50,0xd4 + +# ATT: vpdpbuud %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0x50,0xd4 + +# ATT: vpdpbuud %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0x50,0xd4 + +# ATT: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x64,0x18,0x50,0x10 + +# ATT: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuud 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f + +# ATT: vpdpbuud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x64,0x9f,0x50,0x52,0x80 + +# ATT: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x64,0x38,0x50,0x10 + +# ATT: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuud 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f + +# ATT: vpdpbuud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x64,0xbf,0x50,0x52,0x80 + +# ATT: vpdpbuud 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x64,0x58,0x50,0x10 + +# ATT: vpdpbuud -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbuud 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f + +# ATT: vpdpbuud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x64,0xdf,0x50,0x52,0x80 + +# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0x51,0xd4 + +# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0x51,0xd4 + +# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x64,0x8f,0x51,0xd4 + +# ATT: 
vpdpbuuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0x51,0xd4 + +# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0x51,0xd4 + +# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0x51,0xd4 + +# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpbuuds zmm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0x51,0xd4 + +# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0x51,0xd4 + +# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0x51,0xd4 + +# ATT: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x64,0x18,0x51,0x10 + +# ATT: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f + +# ATT: vpdpbuuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x64,0x9f,0x51,0x52,0x80 + +# ATT: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x64,0x38,0x51,0x10 + +# ATT: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f + +# ATT: vpdpbuuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x64,0xbf,0x51,0x52,0x80 + +# ATT: vpdpbuuds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x64,0x58,0x51,0x10 + +# ATT: vpdpbuuds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbuuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f + +# ATT: vpdpbuuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x64,0xdf,0x51,0x52,0x80 + +# VNNI INT16 + +# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0xd2,0xd4 + +# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x66,0x0f,0xd2,0xd4 + +# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x66,0x8f,0xd2,0xd4 + +# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0xd2,0xd4 + +# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x66,0x2f,0xd2,0xd4 + +# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x66,0xaf,0xd2,0xd4 + +# ATT: vpdpwsud %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwsud zmm2, zmm3, zmm4 +0x62,0xf2,0x66,0x48,0xd2,0xd4 + +# ATT: vpdpwsud %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x66,0x4f,0xd2,0xd4 + +# ATT: vpdpwsud %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x66,0xcf,0xd2,0xd4 + +# ATT: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x66,0x18,0xd2,0x10 + +# ATT: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsud 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f + +# ATT: vpdpwsud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80 + +# ATT: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x66,0x38,0xd2,0x10 + +# ATT: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsud 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f + +# ATT: vpdpwsud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80 + +# ATT: vpdpwsud 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x66,0x58,0xd2,0x10 + +# ATT: vpdpwsud -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsud 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f + +# ATT: vpdpwsud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80 + +# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0xd3,0xd4 + +# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x66,0x0f,0xd3,0xd4 + +# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x66,0x8f,0xd3,0xd4 + +# ATT: 
vpdpwsuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0xd3,0xd4 + +# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x66,0x2f,0xd3,0xd4 + +# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x66,0xaf,0xd3,0xd4 + +# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwsuds zmm2, zmm3, zmm4 +0x62,0xf2,0x66,0x48,0xd3,0xd4 + +# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x66,0x4f,0xd3,0xd4 + +# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x66,0xcf,0xd3,0xd4 + +# ATT: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x66,0x18,0xd3,0x10 + +# ATT: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f + +# ATT: vpdpwsuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80 + +# ATT: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x66,0x38,0xd3,0x10 + +# ATT: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f + +# ATT: vpdpwsuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80 + +# ATT: vpdpwsuds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x66,0x58,0xd3,0x10 + +# ATT: vpdpwsuds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f + +# ATT: vpdpwsuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80 + +# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmm4 +0xc4,0xe2,0x61,0xd2,0xd4 + +# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x65,0x0f,0xd2,0xd4 + +# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x65,0x8f,0xd2,0xd4 + +# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymm4 +0xc4,0xe2,0x65,0xd2,0xd4 + +# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x65,0x2f,0xd2,0xd4 + +# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x65,0xaf,0xd2,0xd4 + +# ATT: vpdpwusd %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwusd zmm2, zmm3, zmm4 +0x62,0xf2,0x65,0x48,0xd2,0xd4 + +# ATT: vpdpwusd %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x65,0x4f,0xd2,0xd4 + +# ATT: vpdpwusd %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x65,0xcf,0xd2,0xd4 + +# ATT: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x65,0x18,0xd2,0x10 + +# ATT: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusd 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f + +# ATT: vpdpwusd -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80 + +# ATT: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x65,0x38,0xd2,0x10 + +# ATT: vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusd 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f + +# ATT: vpdpwusd -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80 + +# ATT: vpdpwusd 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x65,0x58,0xd2,0x10 + +# ATT: vpdpwusd -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusd 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f + +# ATT: vpdpwusd -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80 + +# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmm4 +0xc4,0xe2,0x61,0xd3,0xd4 + +# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x65,0x0f,0xd3,0xd4 + +# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x65,0x8f,0xd3,0xd4 + +# ATT: 
vpdpwusds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymm4 +0xc4,0xe2,0x65,0xd3,0xd4 + +# ATT: vpdpwusds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x65,0x2f,0xd3,0xd4 + +# ATT: vpdpwusds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x65,0xaf,0xd3,0xd4 + +# ATT: vpdpwusds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwusds zmm2, zmm3, zmm4 +0x62,0xf2,0x65,0x48,0xd3,0xd4 + +# ATT: vpdpwusds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x65,0x4f,0xd3,0xd4 + +# ATT: vpdpwusds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x65,0xcf,0xd3,0xd4 + +# ATT: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x65,0x18,0xd3,0x10 + +# ATT: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f + +# ATT: vpdpwusds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80 + +# ATT: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x65,0x38,0xd3,0x10 + +# ATT: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f + +# ATT: vpdpwusds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80 + +# ATT: vpdpwusds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x65,0x58,0xd3,0x10 + +# ATT: vpdpwusds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f + +# ATT: vpdpwusds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80 + +# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0xd2,0xd4 + +# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0xd2,0xd4 + +# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4 
+0x62,0xf2,0x64,0x8f,0xd2,0xd4 + +# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0xd2,0xd4 + +# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0xd2,0xd4 + +# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0xd2,0xd4 + +# ATT: vpdpwuud %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwuud zmm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0xd2,0xd4 + +# ATT: vpdpwuud %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0xd2,0xd4 + +# ATT: vpdpwuud %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0xd2,0xd4 + +# ATT: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x64,0x18,0xd2,0x10 + +# ATT: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuud 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f + +# ATT: vpdpwuud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80 + +# ATT: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x64,0x38,0xd2,0x10 + +# ATT: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuud 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f + +# ATT: vpdpwuud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80 + +# ATT: vpdpwuud 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x64,0x58,0xd2,0x10 + +# ATT: vpdpwuud -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuud 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f + +# ATT: vpdpwuud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80 + +# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0xd3,0xd4 + +# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0xd3,0xd4 + +# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x64,0x8f,0xd3,0xd4 + +# ATT: 
vpdpwuuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0xd3,0xd4 + +# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0xd3,0xd4 + +# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0xd3,0xd4 + +# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2 +# INTEL: vpdpwuuds zmm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0xd3,0xd4 + +# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0xd3,0xd4 + +# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0xd3,0xd4 + +# ATT: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x64,0x18,0xd3,0x10 + +# ATT: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f + +# ATT: vpdpwuuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80 + +# ATT: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] 
+0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x64,0x38,0xd3,0x10 + +# ATT: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f + +# ATT: vpdpwuuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80 + +# ATT: vpdpwuuds 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x64,0x58,0xd3,0x10 + +# ATT: vpdpwuuds -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f + +# ATT: vpdpwuuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80 + # VMPSADBW # ATT: vmpsadbw $123, %xmm4, %xmm3, %xmm2 diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt index 7f68e9d0da131c0..b5d25ee7e0c017c 100644 --- a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt +++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt @@ -1,6 +1,1416 @@ # RUN: 
llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL +# VNNI FP16 + +# ATT: vdpphps %xmm24, %xmm23, %xmm22 +# INTEL: vdpphps xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0x52,0xf0 + +# ATT: vdpphps %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vdpphps xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0x52,0xf0 + +# ATT: vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0x52,0xf0 + +# ATT: vdpphps %ymm24, %ymm23, %ymm22 +# INTEL: vdpphps ymm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0x52,0xf0 + +# ATT: vdpphps %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vdpphps ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0x52,0xf0 + +# ATT: vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x44,0xa7,0x52,0xf0 + +# ATT: vdpphps %zmm24, %zmm23, %zmm22 +# INTEL: vdpphps zmm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0x52,0xf0 + +# ATT: vdpphps %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vdpphps zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0x52,0xf0 + +# ATT: vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0x52,0xf0 + +# ATT: vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vdpphps xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vdpphps xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vdpphps xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdpphps -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vdpphps xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vdpphps 
2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0x52,0x71,0x7f + +# ATT: vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vdpphps xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x44,0x97,0x52,0x72,0x80 + +# ATT: vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vdpphps ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vdpphps ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vdpphps ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdpphps -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vdpphps ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vdpphps 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f + +# ATT: vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vdpphps ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x44,0xb7,0x52,0x72,0x80 + +# ATT: vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vdpphps zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vdpphps zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vdpphps (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vdpphps zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00 + +# ATT: vdpphps -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vdpphps zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: 
vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f + +# ATT: vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vdpphps zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x44,0xd7,0x52,0x72,0x80 + +# VNNI INT8 + +# ATT: vpdpbssd %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbssd xmm22, xmm23, xmm24 +0x62,0x82,0x47,0x00,0x50,0xf0 + +# ATT: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x47,0x07,0x50,0xf0 + +# ATT: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x47,0x87,0x50,0xf0 + +# ATT: vpdpbssd %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbssd ymm22, ymm23, ymm24 +0x62,0x82,0x47,0x20,0x50,0xf0 + +# ATT: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x47,0x27,0x50,0xf0 + +# ATT: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x47,0xa7,0x50,0xf0 + +# ATT: vpdpbssd %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbssd zmm22, zmm23, zmm24 +0x62,0x82,0x47,0x40,0x50,0xf0 + +# ATT: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x47,0x47,0x50,0xf0 + +# ATT: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x47,0xc7,0x50,0xf0 + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpbssd xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -512(,%rbp,2), %xmm23, %xmm22 
+# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x47,0x87,0x50,0x71,0x7f + +# ATT: vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x47,0x97,0x50,0x72,0x80 + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbssd ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f + +# ATT: vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x47,0xb7,0x50,0x72,0x80 + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbssd zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd 
-2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f + +# ATT: vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x47,0xd7,0x50,0x72,0x80 + +# ATT: vpdpbssds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbssds xmm22, xmm23, xmm24 +0x62,0x82,0x47,0x00,0x51,0xf0 + +# ATT: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x47,0x07,0x51,0xf0 + +# ATT: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x47,0x87,0x51,0xf0 + +# ATT: vpdpbssds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbssds ymm22, ymm23, ymm24 +0x62,0x82,0x47,0x20,0x51,0xf0 + +# ATT: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x47,0x27,0x51,0xf0 + +# ATT: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x47,0xa7,0x51,0xf0 + +# ATT: vpdpbssds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbssds zmm22, zmm23, zmm24 +0x62,0x82,0x47,0x40,0x51,0xf0 + +# ATT: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x47,0x47,0x51,0xf0 + +# ATT: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x47,0xc7,0x51,0xf0 + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip){1to4}, 
%xmm23, %xmm22 +# INTEL: vpdpbssds xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x47,0x87,0x51,0x71,0x7f + +# ATT: vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x47,0x97,0x51,0x72,0x80 + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbssds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f + +# ATT: vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x47,0xb7,0x51,0x72,0x80 + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbssds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f + +# ATT: vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x47,0xd7,0x51,0x72,0x80 + +# ATT: vpdpbsud %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbsud xmm22, xmm23, xmm24 +0x62,0x82,0x46,0x00,0x50,0xf0 + +# ATT: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x46,0x07,0x50,0xf0 + +# ATT: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x46,0x87,0x50,0xf0 + +# ATT: vpdpbsud %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbsud ymm22, ymm23, ymm24 +0x62,0x82,0x46,0x20,0x50,0xf0 + +# ATT: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x46,0x27,0x50,0xf0 + +# ATT: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x46,0xa7,0x50,0xf0 + +# ATT: vpdpbsud %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbsud zmm22, zmm23, zmm24 +0x62,0x82,0x46,0x40,0x50,0xf0 + +# ATT: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x46,0x47,0x50,0xf0 + +# ATT: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x46,0xc7,0x50,0xf0 + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + 
+# ATT: vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpbsud xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x46,0x87,0x50,0x71,0x7f + +# ATT: vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x46,0x97,0x50,0x72,0x80 + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbsud ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f + +# ATT: vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x46,0xb7,0x50,0x72,0x80 + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbsud zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f + +# ATT: vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x46,0xd7,0x50,0x72,0x80 + +# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbsuds xmm22, xmm23, xmm24 +0x62,0x82,0x46,0x00,0x51,0xf0 + +# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x46,0x07,0x51,0xf0 + +# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x46,0x87,0x51,0xf0 + +# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbsuds ymm22, ymm23, ymm24 +0x62,0x82,0x46,0x20,0x51,0xf0 + +# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x46,0x27,0x51,0xf0 + +# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x46,0xa7,0x51,0xf0 + +# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbsuds zmm22, zmm23, zmm24 +0x62,0x82,0x46,0x40,0x51,0xf0 + +# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x46,0x47,0x51,0xf0 + +# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmm24 
+0x62,0x82,0x46,0xc7,0x51,0xf0 + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpbsuds xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x46,0x87,0x51,0x71,0x7f + +# ATT: vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x46,0x97,0x51,0x72,0x80 + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbsuds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f + +# ATT: vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbsuds ymm22 
{k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x46,0xb7,0x51,0x72,0x80 + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbsuds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f + +# ATT: vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x46,0xd7,0x51,0x72,0x80 + +# ATT: vpdpbuud %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbuud xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0x50,0xf0 + +# ATT: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0x50,0xf0 + +# ATT: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0x50,0xf0 + +# ATT: vpdpbuud %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbuud ymm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0x50,0xf0 + +# ATT: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0x50,0xf0 + +# ATT: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x44,0xa7,0x50,0xf0 + +# ATT: vpdpbuud %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbuud zmm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0x50,0xf0 + +# ATT: vpdpbuud %zmm24, %zmm23, %zmm22 
{%k7} +# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0x50,0xf0 + +# ATT: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0x50,0xf0 + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpbuud xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0x50,0x71,0x7f + +# ATT: vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x44,0x97,0x50,0x72,0x80 + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbuud ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuud ymm22 {k7} 
{z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f + +# ATT: vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x44,0xb7,0x50,0x72,0x80 + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbuud zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f + +# ATT: vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x44,0xd7,0x50,0x72,0x80 + +# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpbuuds xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0x51,0xf0 + +# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0x51,0xf0 + +# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0x51,0xf0 + +# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpbuuds ymm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0x51,0xf0 + +# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0x51,0xf0 + +# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymm24 
+0x62,0x82,0x44,0xa7,0x51,0xf0 + +# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpbuuds zmm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0x51,0xf0 + +# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0x51,0xf0 + +# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0x51,0xf0 + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpbuuds xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0x51,0x71,0x7f + +# ATT: vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x44,0x97,0x51,0x72,0x80 + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpbuuds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -1024(,%rbp,2), %ymm23, 
%ymm22 +# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f + +# ATT: vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x44,0xb7,0x51,0x72,0x80 + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpbuuds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f + +# ATT: vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x44,0xd7,0x51,0x72,0x80 + +# VNNI INT16 + +# ATT: vpdpwsud %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwsud xmm22, xmm23, xmm24 +0x62,0x82,0x46,0x00,0xd2,0xf0 + +# ATT: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x46,0x07,0xd2,0xf0 + +# ATT: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x46,0x87,0xd2,0xf0 + +# ATT: vpdpwsud %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwsud ymm22, ymm23, ymm24 +0x62,0x82,0x46,0x20,0xd2,0xf0 + +# ATT: vpdpwsud 
%ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x46,0x27,0xd2,0xf0 + +# ATT: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x46,0xa7,0xd2,0xf0 + +# ATT: vpdpwsud %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwsud zmm22, zmm23, zmm24 +0x62,0x82,0x46,0x40,0xd2,0xf0 + +# ATT: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x46,0x47,0xd2,0xf0 + +# ATT: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x46,0xc7,0xd2,0xf0 + +# ATT: vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwsud xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsud -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f + +# ATT: vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x46,0x97,0xd2,0x72,0x80 + +# ATT: vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud 
(%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwsud ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f + +# ATT: vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80 + +# ATT: vpdpwsud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpwsud zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f + +# ATT: vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80 + +# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwsuds xmm22, xmm23, xmm24 +0x62,0x82,0x46,0x00,0xd3,0xf0 + +# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x46,0x07,0xd3,0xf0 + +# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmm24 
+0x62,0x82,0x46,0x87,0xd3,0xf0 + +# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwsuds ymm22, ymm23, ymm24 +0x62,0x82,0x46,0x20,0xd3,0xf0 + +# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x46,0x27,0xd3,0xf0 + +# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x46,0xa7,0xd3,0xf0 + +# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwsuds zmm22, zmm23, zmm24 +0x62,0x82,0x46,0x40,0xd3,0xf0 + +# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x46,0x47,0xd3,0xf0 + +# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x46,0xc7,0xd3,0xf0 + +# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwsuds xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f + +# ATT: vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x46,0x97,0xd3,0x72,0x80 + +# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: 
vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwsuds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsuds -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f + +# ATT: vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80 + +# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpwsuds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f + +# ATT: vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80 + +# ATT: vpdpwusd %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwusd xmm22, xmm23, xmm24 +0x62,0x82,0x45,0x00,0xd2,0xf0 + +# ATT: vpdpwusd %xmm24, 
%xmm23, %xmm22 {%k7} +# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x45,0x07,0xd2,0xf0 + +# ATT: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x45,0x87,0xd2,0xf0 + +# ATT: vpdpwusd %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwusd ymm22, ymm23, ymm24 +0x62,0x82,0x45,0x20,0xd2,0xf0 + +# ATT: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x45,0x27,0xd2,0xf0 + +# ATT: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x45,0xa7,0xd2,0xf0 + +# ATT: vpdpwusd %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwusd zmm22, zmm23, zmm24 +0x62,0x82,0x45,0x40,0xd2,0xf0 + +# ATT: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x45,0x47,0xd2,0xf0 + +# ATT: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x45,0xc7,0xd2,0xf0 + +# ATT: vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwusd xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusd -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f + +# ATT: vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x45,0x97,0xd2,0x72,0x80 + +# ATT: vpdpwusd 
268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwusd ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f + +# ATT: vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80 + +# ATT: vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpwusd zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f + +# ATT: vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} 
+0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80 + +# ATT: vpdpwusds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwusds xmm22, xmm23, xmm24 +0x62,0x82,0x45,0x00,0xd3,0xf0 + +# ATT: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x45,0x07,0xd3,0xf0 + +# ATT: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x45,0x87,0xd3,0xf0 + +# ATT: vpdpwusds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwusds ymm22, ymm23, ymm24 +0x62,0x82,0x45,0x20,0xd3,0xf0 + +# ATT: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x45,0x27,0xd3,0xf0 + +# ATT: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x45,0xa7,0xd3,0xf0 + +# ATT: vpdpwusds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwusds zmm22, zmm23, zmm24 +0x62,0x82,0x45,0x40,0xd3,0xf0 + +# ATT: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x45,0x47,0xd3,0xf0 + +# ATT: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x45,0xc7,0xd3,0xf0 + +# ATT: vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwusds xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 
+0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f + +# ATT: vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x45,0x97,0xd3,0x72,0x80 + +# ATT: vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwusds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f + +# ATT: vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80 + +# ATT: vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpwusds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: 
vpdpwusds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f + +# ATT: vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80 + +# ATT: vpdpwuud %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwuud xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0xd2,0xf0 + +# ATT: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0xd2,0xf0 + +# ATT: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0xd2,0xf0 + +# ATT: vpdpwuud %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwuud ymm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0xd2,0xf0 + +# ATT: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0xd2,0xf0 + +# ATT: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x44,0xa7,0xd2,0xf0 + +# ATT: vpdpwuud %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwuud zmm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0xd2,0xf0 + +# ATT: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0xd2,0xf0 + +# ATT: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0xd2,0xf0 + +# ATT: vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwuud xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuud -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [2*rbp - 512] 
+0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f + +# ATT: vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x44,0x97,0xd2,0x72,0x80 + +# ATT: vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwuud ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f + +# ATT: vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80 + +# ATT: vpdpwuud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vpdpwuud zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwuud zmm22, 
zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f + +# ATT: vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80 + +# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22 +# INTEL: vpdpwuuds xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0xd3,0xf0 + +# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0xd3,0xf0 + +# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0xd3,0xf0 + +# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22 +# INTEL: vpdpwuuds ymm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0xd3,0xf0 + +# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0xd3,0xf0 + +# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x44,0xa7,0xd3,0xf0 + +# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22 +# INTEL: vpdpwuuds zmm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0xd3,0xf0 + +# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0xd3,0xf0 + +# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0xd3,0xf0 + +# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vpdpwuuds xmm22, xmm23, dword ptr 
[rip]{1to4} +0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f + +# ATT: vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x44,0x97,0xd3,0x72,0x80 + +# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vpdpwuuds ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuuds -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f + +# ATT: vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80 + +# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%rip){1to16}, %zmm23, 
%zmm22 +# INTEL: vpdpwuuds zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f + +# ATT: vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80 + # VMPSADBW # ATT: vmpsadbw $123, %xmm24, %xmm23, %xmm22 diff --git a/llvm/test/MC/X86/avx10_2ni-32-intel.s b/llvm/test/MC/X86/avx10_2ni-32-intel.s index 5dbc1c226e67af9..123f57411acb043 100644 --- a/llvm/test/MC/X86/avx10_2ni-32-intel.s +++ b/llvm/test/MC/X86/avx10_2ni-32-intel.s @@ -1,5 +1,1415 @@ // RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s +// VNNI FP16 + +// CHECK: vdpphps xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0xd4] + vdpphps xmm2, xmm3, xmm4 + +// CHECK: vdpphps xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0xd4] + vdpphps xmm2 {k7}, xmm3, xmm4 + +// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0xd4] + vdpphps xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vdpphps ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0xd4] + vdpphps ymm2, ymm3, ymm4 + +// CHECK: vdpphps ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0xd4] + vdpphps ymm2 {k7}, ymm3, ymm4 + +// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0xd4] + vdpphps ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vdpphps zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0xd4] + vdpphps zmm2, zmm3, zmm4 + +// CHECK: vdpphps zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: 
[0x62,0xf2,0x64,0x4f,0x52,0xd4] + vdpphps zmm2 {k7}, zmm3, zmm4 + +// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0xd4] + vdpphps zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10] + vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00] + vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vdpphps xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x52,0x10] + vdpphps xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff] + vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f] + vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x52,0x52,0x80] + vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10] + vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00] + vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vdpphps ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x52,0x10] + vdpphps ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: 
[0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff] + vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f] + vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x52,0x52,0x80] + vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10] + vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00] + vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vdpphps zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x52,0x10] + vdpphps zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff] + vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f] + vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x52,0x52,0x80] + vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// VNNI INT8 + +// CHECK: vpdpbssd xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4] + vpdpbssd xmm2, xmm3, xmm4 + +// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0xd4] + vpdpbssd xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0xd4] + vpdpbssd xmm2 {k7} {z}, xmm3, xmm4 
+ +// CHECK: vpdpbssd ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4] + vpdpbssd ymm2, ymm3, ymm4 + +// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0xd4] + vpdpbssd ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0xd4] + vpdpbssd ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbssd zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0xd4] + vpdpbssd zmm2, zmm3, zmm4 + +// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0xd4] + vpdpbssd zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0xd4] + vpdpbssd zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x50,0x10] + vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f] + vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x50,0x52,0x80] + vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x50,0x10] + vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f] + vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x50,0x52,0x80] + vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x50,0x10] + vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f] + vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: 
vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x50,0x52,0x80] + vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpbssds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4] + vpdpbssds xmm2, xmm3, xmm4 + +// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0xd4] + vpdpbssds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0xd4] + vpdpbssds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpbssds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4] + vpdpbssds ymm2, ymm3, ymm4 + +// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0xd4] + vpdpbssds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0xd4] + vpdpbssds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbssds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0xd4] + vpdpbssds zmm2, zmm3, zmm4 + +// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0xd4] + vpdpbssds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0xd4] + vpdpbssds zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x51,0x10] + vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: 
encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f] + vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x51,0x52,0x80] + vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x51,0x10] + vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f] + vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x51,0x52,0x80] + vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds 
zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x51,0x10] + vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f] + vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x51,0x52,0x80] + vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpbsud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4] + vpdpbsud xmm2, xmm3, xmm4 + +// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0xd4] + vpdpbsud xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0xd4] + vpdpbsud xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpbsud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4] + vpdpbsud ymm2, ymm3, ymm4 + +// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0xd4] + vpdpbsud ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0xd4] + vpdpbsud ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbsud zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0xd4] + vpdpbsud zmm2, zmm3, zmm4 + +// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0xd4] + vpdpbsud zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0xd4] + vpdpbsud zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 
268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x50,0x10] + vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f] + vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x50,0x52,0x80] + vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x50,0x10] + vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f] + vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + 
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x50,0x52,0x80] + vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x50,0x10] + vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f] + vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x50,0x52,0x80] + vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpbsuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4] + vpdpbsuds xmm2, xmm3, xmm4 + +// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0xd4] + vpdpbsuds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0xd4] + vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpbsuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4] + vpdpbsuds ymm2, ymm3, ymm4 + +// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0xd4] + vpdpbsuds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbsuds ymm2 {k7} 
{z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0xd4] + vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbsuds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0xd4] + vpdpbsuds zmm2, zmm3, zmm4 + +// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0xd4] + vpdpbsuds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0xd4] + vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x51,0x10] + vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f] + vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x51,0x52,0x80] + vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + 
vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x51,0x10] + vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f] + vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x51,0x52,0x80] + vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x51,0x10] + vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f] + vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x51,0x52,0x80] + vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpbuud xmm2, xmm3, xmm4 +// CHECK: encoding: 
[0xc4,0xe2,0x60,0x50,0xd4] + vpdpbuud xmm2, xmm3, xmm4 + +// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0xd4] + vpdpbuud xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0xd4] + vpdpbuud xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpbuud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4] + vpdpbuud ymm2, ymm3, ymm4 + +// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0xd4] + vpdpbuud ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0xd4] + vpdpbuud ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbuud zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0xd4] + vpdpbuud zmm2, zmm3, zmm4 + +// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0xd4] + vpdpbuud zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0xd4] + vpdpbuud zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x50,0x10] + vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f] + vpdpbuud xmm2 {k7} {z}, 
xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x50,0x52,0x80] + vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x50,0x10] + vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f] + vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x50,0x52,0x80] + vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x50,0x10] + vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: 
encoding: [0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f] + vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x50,0x52,0x80] + vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpbuuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4] + vpdpbuuds xmm2, xmm3, xmm4 + +// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x51,0xd4] + vpdpbuuds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0xd4] + vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpbuuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4] + vpdpbuuds ymm2, ymm3, ymm4 + +// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0xd4] + vpdpbuuds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0xd4] + vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpbuuds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0xd4] + vpdpbuuds zmm2, zmm3, zmm4 + +// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0xd4] + vpdpbuuds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0xd4] + vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: 
[0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x51,0x10] + vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f] + vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x51,0x52,0x80] + vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x51,0x10] + vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f] + vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x51,0x52,0x80] + vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpbuuds zmm2, 
zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x51,0x10] + vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f] + vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x51,0x52,0x80] + vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// VNNI INT16 + +// CHECK: vpdpwsud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4] + vpdpwsud xmm2, xmm3, xmm4 + +// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0xd4] + vpdpwsud xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0xd4] + vpdpwsud xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwsud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4] + vpdpwsud ymm2, ymm3, ymm4 + +// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0xd4] + vpdpwsud ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0xd4] + vpdpwsud ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwsud zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0xd4] + 
vpdpwsud zmm2, zmm3, zmm4 + +// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0xd4] + vpdpwsud zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0xd4] + vpdpwsud zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd2,0x10] + vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f] + vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80] + vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd2,0x10] + vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: 
vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f] + vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80] + vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd2,0x10] + vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f] + vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80] + vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpwsuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4] + vpdpwsuds xmm2, xmm3, xmm4 + +// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0xd4] + vpdpwsuds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: 
encoding: [0x62,0xf2,0x66,0x8f,0xd3,0xd4] + vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwsuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4] + vpdpwsuds ymm2, ymm3, ymm4 + +// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0xd4] + vpdpwsuds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0xd4] + vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwsuds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0xd4] + vpdpwsuds zmm2, zmm3, zmm4 + +// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0xd4] + vpdpwsuds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd3,0xd4] + vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd3,0x10] + vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f] + vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80] + vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + 
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd3,0x10] + vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f] + vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80] + vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd3,0x10] + vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: 
[0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f] + vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80] + vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpwusd xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4] + vpdpwusd xmm2, xmm3, xmm4 + +// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0xd4] + vpdpwusd xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0xd4] + vpdpwusd xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwusd ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4] + vpdpwusd ymm2, ymm3, ymm4 + +// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0xd4] + vpdpwusd ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0xd4] + vpdpwusd ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwusd zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0xd4] + vpdpwusd zmm2, zmm3, zmm4 + +// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd2,0xd4] + vpdpwusd zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0xd4] + vpdpwusd zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd2,0x10] + vpdpwusd xmm2, xmm3, dword ptr 
[eax]{1to4} + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f] + vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80] + vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd2,0x10] + vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f] + vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80] + vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: 
[0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd2,0x10] + vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f] + vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80] + vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpwusds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4] + vpdpwusds xmm2, xmm3, xmm4 + +// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0xd4] + vpdpwusds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0xd4] + vpdpwusds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwusds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4] + vpdpwusds ymm2, ymm3, ymm4 + +// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0xd4] + vpdpwusds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd3,0xd4] + vpdpwusds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwusds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0xd4] + vpdpwusds zmm2, zmm3, zmm4 + +// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0xd4] + vpdpwusds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0xd4] + vpdpwusds zmm2 {k7} {z}, 
zmm3, zmm4 + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd3,0x10] + vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f] + vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80] + vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd3,0x10] + vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: 
[0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f] + vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80] + vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd3,0x10] + vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f] + vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80] + vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vpdpwuud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4] + vpdpwuud xmm2, xmm3, xmm4 + +// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0xd4] + vpdpwuud xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0xd4] + vpdpwuud xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwuud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4] + vpdpwuud ymm2, ymm3, ymm4 + +// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: 
[0x62,0xf2,0x64,0x2f,0xd2,0xd4] + vpdpwuud ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0xd4] + vpdpwuud ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwuud zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0xd4] + vpdpwuud zmm2, zmm3, zmm4 + +// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0xd4] + vpdpwuud zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0xd4] + vpdpwuud zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd2,0x10] + vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f] + vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80] + vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// 
CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd2,0x10] + vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f] + vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80] + vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd2,0x10] + vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f] + vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80] + vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: 
vpdpwuuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4] + vpdpwuuds xmm2, xmm3, xmm4 + +// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0xd4] + vpdpwuuds xmm2 {k7}, xmm3, xmm4 + +// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd3,0xd4] + vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vpdpwuuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4] + vpdpwuuds ymm2, ymm3, ymm4 + +// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0xd4] + vpdpwuuds ymm2 {k7}, ymm3, ymm4 + +// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0xd4] + vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vpdpwuuds zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0xd4] + vpdpwuuds zmm2, zmm3, zmm4 + +// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0xd4] + vpdpwuuds zmm2 {k7}, zmm3, zmm4 + +// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0xd4] + vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd3,0x10] + vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: 
encoding: [0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f] + vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80] + vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd3,0x10] + vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f] + vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80] + vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd3,0x10] + vpdpwuuds zmm2, zmm3, dword 
ptr [eax]{1to16} + +// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff] + vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f] + vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80] + vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + // VMPSADBW // CHECK: vmpsadbw xmm2, xmm3, xmm4, 123 diff --git a/llvm/test/MC/X86/avx10_2ni-64-att.s b/llvm/test/MC/X86/avx10_2ni-64-att.s index 09566eb50ddad04..4fa7c0a918528c3 100644 --- a/llvm/test/MC/X86/avx10_2ni-64-att.s +++ b/llvm/test/MC/X86/avx10_2ni-64-att.s @@ -1,5 +1,1415 @@ // RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +// VNNI FP16 + +// CHECK: vdpphps %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0x52,0xf0] + vdpphps %xmm24, %xmm23, %xmm22 + +// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0x52,0xf0] + vdpphps %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0x52,0xf0] + vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdpphps %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0x52,0xf0] + vdpphps %ymm24, %ymm23, %ymm22 + +// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x27,0x52,0xf0] + vdpphps %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x52,0xf0] + vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdpphps %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0x52,0xf0] + vdpphps %zmm24, %zmm23, %zmm22 + +// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: 
encoding: [0x62,0x82,0x44,0x47,0x52,0xf0] + vdpphps %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x52,0xf0] + vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vdpphps (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00] + vdpphps (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vdpphps -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff] + vdpphps -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vdpphps 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x52,0x71,0x7f] + vdpphps 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x52,0x72,0x80] + vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vdpphps (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00] + vdpphps (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vdpphps -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff] + vdpphps -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vdpphps 
4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f] + vdpphps 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x52,0x72,0x80] + vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vdpphps (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00] + vdpphps (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vdpphps -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff] + vdpphps -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f] + vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x52,0x72,0x80] + vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// VNNI INT8 + +// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x00,0x50,0xf0] + vpdpbssd %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x07,0x50,0xf0] + vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0x87,0x50,0xf0] + vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x47,0x20,0x50,0xf0] + vpdpbssd %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} +// 
CHECK: encoding: [0x62,0x82,0x47,0x27,0x50,0xf0] + vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x50,0xf0] + vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x40,0x50,0xf0] + vpdpbssd %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x47,0x50,0xf0] + vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x50,0xf0] + vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbssd (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbssd (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbssd -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x50,0x71,0x7f] + vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x50,0x72,0x80] + vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbssd (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbssd (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f] + vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x50,0x72,0x80] + vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbssd (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbssd (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbssd -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbssd -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f] + vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x50,0x72,0x80] + vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x00,0x51,0xf0] + vpdpbssds %xmm24, %xmm23, %xmm22 + +// CHECK: 
vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x07,0x51,0xf0] + vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0x87,0x51,0xf0] + vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x47,0x20,0x51,0xf0] + vpdpbssds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x27,0x51,0xf0] + vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x51,0xf0] + vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x40,0x51,0xf0] + vpdpbssds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x47,0x51,0xf0] + vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x51,0xf0] + vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbssds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbssds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbssds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x51,0x71,0x7f] + 
vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x51,0x72,0x80] + vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbssds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbssds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f] + vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x51,0x72,0x80] + vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbssds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbssds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbssds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbssds 
-2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f] + vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x51,0x72,0x80] + vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x00,0x50,0xf0] + vpdpbsud %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x07,0x50,0xf0] + vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0x87,0x50,0xf0] + vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x46,0x20,0x50,0xf0] + vpdpbsud %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x27,0x50,0xf0] + vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x50,0xf0] + vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x40,0x50,0xf0] + vpdpbsud %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x47,0x50,0xf0] + vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x50,0xf0] + vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + 
vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbsud (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbsud (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbsud -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x50,0x71,0x7f] + vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x50,0x72,0x80] + vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbsud (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbsud (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f] + vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x50,0x72,0x80] + vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbsud 
291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbsud (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbsud (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f] + vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x50,0x72,0x80] + vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x00,0x51,0xf0] + vpdpbsuds %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x07,0x51,0xf0] + vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0x87,0x51,0xf0] + vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x46,0x20,0x51,0xf0] + vpdpbsuds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x27,0x51,0xf0] + vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x51,0xf0] + vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x40,0x51,0xf0] + vpdpbsuds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x47,0x51,0xf0] + vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} + 
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x51,0xf0] + vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbsuds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbsuds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x51,0x71,0x7f] + vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x51,0x72,0x80] + vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbsuds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbsuds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} 
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f] + vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x51,0x72,0x80] + vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbsuds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbsuds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f] + vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x51,0x72,0x80] + vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0x50,0xf0] + vpdpbuud %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0x50,0xf0] + vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0x50,0xf0] + vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0x50,0xf0] + vpdpbuud %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0x82,0x44,0x27,0x50,0xf0] + vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x50,0xf0] + vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0x50,0xf0] + vpdpbuud %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x47,0x50,0xf0] + vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x50,0xf0] + vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbuud (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbuud (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbuud -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x50,0x71,0x7f] + vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x50,0x72,0x80] + vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbuud (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbuud (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f] + vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x50,0x72,0x80] + vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbuud (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00] + vpdpbuud (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f] + vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x50,0x72,0x80] + vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0x51,0xf0] + vpdpbuuds %xmm24, %xmm23, %xmm22 + +// CHECK: 
vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0x51,0xf0] + vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0x51,0xf0] + vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0x51,0xf0] + vpdpbuuds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x27,0x51,0xf0] + vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x51,0xf0] + vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0x51,0xf0] + vpdpbuuds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x47,0x51,0xf0] + vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x51,0xf0] + vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpbuuds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbuuds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x51,0x71,0x7f] + 
vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x51,0x72,0x80] + vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpbuuds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbuuds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpbuuds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f] + vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x51,0x72,0x80] + vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpbuuds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00] + vpdpbuuds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpbuuds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpbuuds 
-2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f] + vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x51,0x72,0x80] + vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// VNNI INT16 + +// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd2,0xf0] + vpdpwsud %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd2,0xf0] + vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd2,0xf0] + vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd2,0xf0] + vpdpwsud %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd2,0xf0] + vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd2,0xf0] + vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd2,0xf0] + vpdpwsud %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x47,0xd2,0xf0] + vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd2,0xf0] + vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: 
[0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwsud (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwsud (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwsud -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f] + vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd2,0x72,0x80] + vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwsud (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwsud (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f] + vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80] + vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud 
268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwsud (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwsud (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f] + vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80] + vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd3,0xf0] + vpdpwsuds %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd3,0xf0] + vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd3,0xf0] + vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd3,0xf0] + vpdpwsuds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd3,0xf0] + vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd3,0xf0] + vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd3,0xf0] + vpdpwsuds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: 
[0x62,0x82,0x46,0x47,0xd3,0xf0] + vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd3,0xf0] + vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwsuds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwsuds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f] + vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd3,0x72,0x80] + vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwsuds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwsuds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwsuds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds -1024(,%rbp,2), 
%ymm23, %ymm22 + +// CHECK: vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f] + vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80] + vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwsuds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwsuds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f] + vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80] + vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd2,0xf0] + vpdpwusd %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd2,0xf0] + vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd2,0xf0] + vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd2,0xf0] + vpdpwusd %ymm24, %ymm23, %ymm22 + +// 
CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd2,0xf0] + vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd2,0xf0] + vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd2,0xf0] + vpdpwusd %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd2,0xf0] + vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd2,0xf0] + vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwusd (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwusd (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwusd -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f] + vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd2,0x72,0x80] + vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwusd 291(%r8,%rax,4), 
%ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwusd (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwusd (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f] + vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80] + vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwusd (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwusd (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f] + vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80] + vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd3,0xf0] + vpdpwusds 
%xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd3,0xf0] + vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd3,0xf0] + vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd3,0xf0] + vpdpwusds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd3,0xf0] + vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd3,0xf0] + vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd3,0xf0] + vpdpwusds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd3,0xf0] + vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd3,0xf0] + vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwusds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwusds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwusds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f] + vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd3,0x72,0x80] + vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwusds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwusds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f] + vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80] + vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwusds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwusds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: 
[0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f] + vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80] + vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd2,0xf0] + vpdpwuud %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd2,0xf0] + vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd2,0xf0] + vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd2,0xf0] + vpdpwuud %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd2,0xf0] + vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd2,0xf0] + vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd2,0xf0] + vpdpwuud %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x47,0xd2,0xf0] + vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd2,0xf0] + vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: 
encoding: [0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwuud (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwuud (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwuud -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f] + vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd2,0x72,0x80] + vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwuud (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwuud (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f] + vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80] + vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud 
268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwuud (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00] + vpdpwuud (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f] + vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80] + vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd3,0xf0] + vpdpwuuds %xmm24, %xmm23, %xmm22 + +// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd3,0xf0] + vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd3,0xf0] + vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd3,0xf0] + vpdpwuuds %ymm24, %ymm23, %ymm22 + +// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd3,0xf0] + vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd3,0xf0] + vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd3,0xf0] + vpdpwuuds %zmm24, %zmm23, %zmm22 + +// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: 
[0x62,0x82,0x44,0x47,0xd3,0xf0] + vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd3,0xf0] + vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vpdpwuuds (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwuuds (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f] + vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd3,0x72,0x80] + vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vpdpwuuds (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwuuds (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vpdpwuuds -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds -1024(,%rbp,2), 
%ymm23, %ymm22 + +// CHECK: vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f] + vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80] + vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vpdpwuuds (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00] + vpdpwuuds (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff] + vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f] + vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80] + vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + // VMPSADBW // CHECK: vmpsadbw $123, %xmm24, %xmm23, %xmm22 diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index e85cde314059484..3b7caeff44e5789 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -4175,6 +4175,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0}, {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0}, {X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0}, + {X86::VDPPHPSZ128r, X86::VDPPHPSZ128m, 0}, + {X86::VDPPHPSZ256r, 
X86::VDPPHPSZ256m, 0}, + {X86::VDPPHPSZr, X86::VDPPHPSZm, 0}, {X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0}, {X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0}, {X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE}, @@ -4913,12 +4916,24 @@ static const X86FoldTableEntry Table3[] = { {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0}, {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0}, {X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0}, + {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128m, 0}, + {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256m, 0}, + {X86::VPDPBSSDSZr, X86::VPDPBSSDSZm, 0}, {X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0}, {X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0}, + {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128m, 0}, + {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256m, 0}, + {X86::VPDPBSSDZr, X86::VPDPBSSDZm, 0}, {X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0}, {X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0}, + {X86::VPDPBSUDSZ128r, X86::VPDPBSUDSZ128m, 0}, + {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256m, 0}, + {X86::VPDPBSUDSZr, X86::VPDPBSUDSZm, 0}, {X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0}, {X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0}, + {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128m, 0}, + {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256m, 0}, + {X86::VPDPBSUDZr, X86::VPDPBSUDZm, 0}, {X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0}, {X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0}, {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0}, @@ -4931,8 +4946,14 @@ static const X86FoldTableEntry Table3[] = { {X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0}, {X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0}, {X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0}, + {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128m, 0}, + {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256m, 0}, + {X86::VPDPBUUDSZr, X86::VPDPBUUDSZm, 0}, {X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0}, {X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0}, + {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128m, 0}, + {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256m, 0}, + {X86::VPDPBUUDZr, X86::VPDPBUUDZm, 0}, {X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0}, {X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0}, {X86::VPDPWSSDSZ128r, 
X86::VPDPWSSDSZ128m, 0}, @@ -4945,16 +4966,34 @@ static const X86FoldTableEntry Table3[] = { {X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0}, {X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0}, {X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0}, + {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128m, 0}, + {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256m, 0}, + {X86::VPDPWSUDSZr, X86::VPDPWSUDSZm, 0}, {X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0}, {X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0}, + {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128m, 0}, + {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256m, 0}, + {X86::VPDPWSUDZr, X86::VPDPWSUDZm, 0}, {X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0}, {X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0}, + {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128m, 0}, + {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256m, 0}, + {X86::VPDPWUSDSZr, X86::VPDPWUSDSZm, 0}, {X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0}, {X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0}, + {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128m, 0}, + {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256m, 0}, + {X86::VPDPWUSDZr, X86::VPDPWUSDZm, 0}, {X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0}, {X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0}, + {X86::VPDPWUUDSZ128r, X86::VPDPWUUDSZ128m, 0}, + {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256m, 0}, + {X86::VPDPWUUDSZr, X86::VPDPWUUDSZm, 0}, {X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0}, {X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0}, + {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128m, 0}, + {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256m, 0}, + {X86::VPDPWUUDZr, X86::VPDPWUUDZm, 0}, {X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0}, {X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0}, {X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0}, @@ -5628,6 +5667,12 @@ static const X86FoldTableEntry Table4[] = { {X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0}, {X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0}, {X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0}, + {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mk, 0}, + {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mkz, 0}, + {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mk, 0}, + {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mkz, 0}, + 
{X86::VDPPHPSZrk, X86::VDPPHPSZmk, 0}, + {X86::VDPPHPSZrkz, X86::VDPPHPSZmkz, 0}, {X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0}, {X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0}, {X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0}, @@ -6226,6 +6271,30 @@ static const X86FoldTableEntry Table4[] = { {X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0}, {X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0}, {X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0}, + {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mk, 0}, + {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mkz, 0}, + {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mk, 0}, + {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mkz, 0}, + {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmk, 0}, + {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmkz, 0}, + {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mk, 0}, + {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mkz, 0}, + {X86::VPDPBSSDZ256rk, X86::VPDPBSSDZ256mk, 0}, + {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mkz, 0}, + {X86::VPDPBSSDZrk, X86::VPDPBSSDZmk, 0}, + {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmkz, 0}, + {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mk, 0}, + {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mkz, 0}, + {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mk, 0}, + {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mkz, 0}, + {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmk, 0}, + {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmkz, 0}, + {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mk, 0}, + {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mkz, 0}, + {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mk, 0}, + {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mkz, 0}, + {X86::VPDPBSUDZrk, X86::VPDPBSUDZmk, 0}, + {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmkz, 0}, {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0}, {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0}, {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0}, @@ -6238,6 +6307,18 @@ static const X86FoldTableEntry Table4[] = { {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0}, {X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0}, {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0}, + {X86::VPDPBUUDSZ128rk, 
X86::VPDPBUUDSZ128mk, 0}, + {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mkz, 0}, + {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mk, 0}, + {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mkz, 0}, + {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmk, 0}, + {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmkz, 0}, + {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mk, 0}, + {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mkz, 0}, + {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mk, 0}, + {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mkz, 0}, + {X86::VPDPBUUDZrk, X86::VPDPBUUDZmk, 0}, + {X86::VPDPBUUDZrkz, X86::VPDPBUUDZmkz, 0}, {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0}, {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0}, {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0}, @@ -6250,6 +6331,42 @@ static const X86FoldTableEntry Table4[] = { {X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0}, {X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0}, {X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0}, + {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mk, 0}, + {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mkz, 0}, + {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mk, 0}, + {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mkz, 0}, + {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmk, 0}, + {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmkz, 0}, + {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mk, 0}, + {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mkz, 0}, + {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mk, 0}, + {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mkz, 0}, + {X86::VPDPWSUDZrk, X86::VPDPWSUDZmk, 0}, + {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmkz, 0}, + {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mk, 0}, + {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mkz, 0}, + {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mk, 0}, + {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mkz, 0}, + {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmk, 0}, + {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmkz, 0}, + {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mk, 0}, + {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mkz, 0}, + {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mk, 0}, + {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mkz, 0}, 
+ {X86::VPDPWUSDZrk, X86::VPDPWUSDZmk, 0}, + {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmkz, 0}, + {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mk, 0}, + {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mkz, 0}, + {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mk, 0}, + {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mkz, 0}, + {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmk, 0}, + {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmkz, 0}, + {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mk, 0}, + {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mkz, 0}, + {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mk, 0}, + {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mkz, 0}, + {X86::VPDPWUUDZrk, X86::VPDPWUUDZmk, 0}, + {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmkz, 0}, {X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0}, {X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0}, {X86::VPERMBZrrk, X86::VPERMBZrmk, 0}, @@ -7892,6 +8009,9 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128mb, TB_BCAST_SS}, {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256mb, TB_BCAST_SS}, {X86::VDPBF16PSZr, X86::VDPBF16PSZmb, TB_BCAST_SS}, + {X86::VDPPHPSZ128r, X86::VDPPHPSZ128mb, TB_BCAST_SS}, + {X86::VDPPHPSZ256r, X86::VDPPHPSZ256mb, TB_BCAST_SS}, + {X86::VDPPHPSZr, X86::VDPPHPSZmb, TB_BCAST_SS}, {X86::VEXP2PDZrk, X86::VEXP2PDZmbk, TB_BCAST_SD}, {X86::VEXP2PSZrk, X86::VEXP2PSZmbk, TB_BCAST_SS}, {X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128mb, TB_BCAST_SS}, @@ -8227,18 +8347,54 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmbk, TB_BCAST_Q}, {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmbk, TB_BCAST_Q}, + {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128mb, TB_BCAST_D}, + {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256mb, TB_BCAST_D}, + {X86::VPDPBSSDSZr, X86::VPDPBSSDSZmb, TB_BCAST_D}, + {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128mb, TB_BCAST_D}, + {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256mb, TB_BCAST_D}, + {X86::VPDPBSSDZr, X86::VPDPBSSDZmb, TB_BCAST_D}, + {X86::VPDPBSUDSZ128r, 
X86::VPDPBSUDSZ128mb, TB_BCAST_D}, + {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256mb, TB_BCAST_D}, + {X86::VPDPBSUDSZr, X86::VPDPBSUDSZmb, TB_BCAST_D}, + {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128mb, TB_BCAST_D}, + {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256mb, TB_BCAST_D}, + {X86::VPDPBSUDZr, X86::VPDPBSUDZmb, TB_BCAST_D}, {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_D}, {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_D}, {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_D}, {X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128mb, TB_BCAST_D}, {X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256mb, TB_BCAST_D}, {X86::VPDPBUSDZr, X86::VPDPBUSDZmb, TB_BCAST_D}, + {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128mb, TB_BCAST_D}, + {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256mb, TB_BCAST_D}, + {X86::VPDPBUUDSZr, X86::VPDPBUUDSZmb, TB_BCAST_D}, + {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128mb, TB_BCAST_D}, + {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256mb, TB_BCAST_D}, + {X86::VPDPBUUDZr, X86::VPDPBUUDZmb, TB_BCAST_D}, {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_D}, {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_D}, {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_D}, {X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128mb, TB_BCAST_D}, {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256mb, TB_BCAST_D}, {X86::VPDPWSSDZr, X86::VPDPWSSDZmb, TB_BCAST_D}, + {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128mb, TB_BCAST_D}, + {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256mb, TB_BCAST_D}, + {X86::VPDPWSUDSZr, X86::VPDPWSUDSZmb, TB_BCAST_D}, + {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128mb, TB_BCAST_D}, + {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256mb, TB_BCAST_D}, + {X86::VPDPWSUDZr, X86::VPDPWSUDZmb, TB_BCAST_D}, + {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128mb, TB_BCAST_D}, + {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256mb, TB_BCAST_D}, + {X86::VPDPWUSDSZr, X86::VPDPWUSDSZmb, TB_BCAST_D}, + {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128mb, TB_BCAST_D}, + {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256mb, TB_BCAST_D}, + {X86::VPDPWUSDZr, X86::VPDPWUSDZmb, TB_BCAST_D}, + {X86::VPDPWUUDSZ128r, 
X86::VPDPWUUDSZ128mb, TB_BCAST_D}, + {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256mb, TB_BCAST_D}, + {X86::VPDPWUUDSZr, X86::VPDPWUUDSZmb, TB_BCAST_D}, + {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128mb, TB_BCAST_D}, + {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256mb, TB_BCAST_D}, + {X86::VPDPWUUDZr, X86::VPDPWUUDZmb, TB_BCAST_D}, {X86::VPERMDZ256rrkz, X86::VPERMDZ256rmbkz, TB_BCAST_D}, {X86::VPERMDZrrkz, X86::VPERMDZrmbkz, TB_BCAST_D}, {X86::VPERMI2DZ128rr, X86::VPERMI2DZ128rmb, TB_BCAST_D}, @@ -8632,6 +8788,12 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mbkz, TB_BCAST_SS}, {X86::VDPBF16PSZrk, X86::VDPBF16PSZmbk, TB_BCAST_SS}, {X86::VDPBF16PSZrkz, X86::VDPBF16PSZmbkz, TB_BCAST_SS}, + {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mbk, TB_BCAST_SS}, + {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mbkz, TB_BCAST_SS}, + {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mbk, TB_BCAST_SS}, + {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mbkz, TB_BCAST_SS}, + {X86::VDPPHPSZrk, X86::VDPPHPSZmbk, TB_BCAST_SS}, + {X86::VDPPHPSZrkz, X86::VDPPHPSZmbkz, TB_BCAST_SS}, {X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mbk, TB_BCAST_SS}, {X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mbkz, TB_BCAST_SS}, {X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mbk, TB_BCAST_SS}, @@ -9079,6 +9241,30 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPANDQZ128rrk, X86::VPANDQZ128rmbk, TB_BCAST_Q}, {X86::VPANDQZ256rrk, X86::VPANDQZ256rmbk, TB_BCAST_Q}, {X86::VPANDQZrrk, X86::VPANDQZrmbk, TB_BCAST_Q}, + {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmbk, TB_BCAST_D}, + {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmbkz, TB_BCAST_D}, + {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mbk, TB_BCAST_D}, + {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mbkz, TB_BCAST_D}, + {X86::VPDPBSSDZ256rk, 
X86::VPDPBSSDZ256mbk, TB_BCAST_D}, + {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mbkz, TB_BCAST_D}, + {X86::VPDPBSSDZrk, X86::VPDPBSSDZmbk, TB_BCAST_D}, + {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmbkz, TB_BCAST_D}, + {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mbk, TB_BCAST_D}, + {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mbk, TB_BCAST_D}, + {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmbk, TB_BCAST_D}, + {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmbkz, TB_BCAST_D}, + {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mbk, TB_BCAST_D}, + {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mbkz, TB_BCAST_D}, + {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mbk, TB_BCAST_D}, + {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mbkz, TB_BCAST_D}, + {X86::VPDPBSUDZrk, X86::VPDPBSUDZmbk, TB_BCAST_D}, + {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmbkz, TB_BCAST_D}, {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_D}, {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_D}, {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_D}, @@ -9091,6 +9277,18 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mbkz, TB_BCAST_D}, {X86::VPDPBUSDZrk, X86::VPDPBUSDZmbk, TB_BCAST_D}, {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmbkz, TB_BCAST_D}, + {X86::VPDPBUUDSZ128rk, X86::VPDPBUUDSZ128mbk, TB_BCAST_D}, + {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mbk, TB_BCAST_D}, + {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmbk, TB_BCAST_D}, + {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmbkz, TB_BCAST_D}, + {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mbk, TB_BCAST_D}, + {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mbkz, TB_BCAST_D}, + {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mbk, TB_BCAST_D}, + {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mbkz, TB_BCAST_D}, + {X86::VPDPBUUDZrk, X86::VPDPBUUDZmbk, TB_BCAST_D}, + 
{X86::VPDPBUUDZrkz, X86::VPDPBUUDZmbkz, TB_BCAST_D}, {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_D}, {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_D}, {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_D}, @@ -9103,6 +9301,42 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mbkz, TB_BCAST_D}, {X86::VPDPWSSDZrk, X86::VPDPWSSDZmbk, TB_BCAST_D}, {X86::VPDPWSSDZrkz, X86::VPDPWSSDZmbkz, TB_BCAST_D}, + {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mbk, TB_BCAST_D}, + {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mbk, TB_BCAST_D}, + {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmbk, TB_BCAST_D}, + {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmbkz, TB_BCAST_D}, + {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mbk, TB_BCAST_D}, + {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mbkz, TB_BCAST_D}, + {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mbk, TB_BCAST_D}, + {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mbkz, TB_BCAST_D}, + {X86::VPDPWSUDZrk, X86::VPDPWSUDZmbk, TB_BCAST_D}, + {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmbkz, TB_BCAST_D}, + {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mbk, TB_BCAST_D}, + {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mbkz, TB_BCAST_D}, + {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mbk, TB_BCAST_D}, + {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmbk, TB_BCAST_D}, + {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmbkz, TB_BCAST_D}, + {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mbk, TB_BCAST_D}, + {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mbkz, TB_BCAST_D}, + {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mbk, TB_BCAST_D}, + {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mbkz, TB_BCAST_D}, + {X86::VPDPWUSDZrk, X86::VPDPWUSDZmbk, TB_BCAST_D}, + {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmbkz, TB_BCAST_D}, + {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mbk, TB_BCAST_D}, + {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mbkz, 
TB_BCAST_D}, + {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mbk, TB_BCAST_D}, + {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mbkz, TB_BCAST_D}, + {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmbk, TB_BCAST_D}, + {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmbkz, TB_BCAST_D}, + {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mbk, TB_BCAST_D}, + {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mbkz, TB_BCAST_D}, + {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mbk, TB_BCAST_D}, + {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mbkz, TB_BCAST_D}, + {X86::VPDPWUUDZrk, X86::VPDPWUUDZmbk, TB_BCAST_D}, + {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmbkz, TB_BCAST_D}, {X86::VPERMDZ256rrk, X86::VPERMDZ256rmbk, TB_BCAST_D}, {X86::VPERMDZrrk, X86::VPERMDZrmbk, TB_BCAST_D}, {X86::VPERMI2DZ128rrk, X86::VPERMI2DZ128rmbk, TB_BCAST_D}, diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index f967344135553b6..60b1a48721653f9 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -242,7 +242,8 @@ void X86InstrMappingEmitter::emitCompressEVEXTable( auto It = llvm::find_if(Predicates, [](const Record *R) { StringRef Name = R->getName(); return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" || - Name == "HasAVXIFMA"; + Name == "HasAVXIFMA" || Name == "HasAVXVNNIINT8" || + Name == "HasAVXVNNIINT16"; }); if (It != Predicates.end()) PredicateInsts[(*It)->getValueAsString("CondString")].push_back(NewInst);