diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 896075a57aae87..97f9cabaf6f471 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2348,11 +2348,6 @@ void Compiler::compSetProcessor() #ifdef TARGET_XARCH if (!compIsForInlining()) { - if (canUseEvexEncoding()) - { - codeGen->GetEmitter()->SetUseEvexEncoding(true); - // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added. - } if (canUseVexEncoding()) { codeGen->GetEmitter()->SetUseVEXEncoding(true); @@ -2360,6 +2355,11 @@ void Compiler::compSetProcessor() codeGen->GetEmitter()->SetContainsAVX(false); codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false); } + if (canUseEvexEncoding()) + { + codeGen->GetEmitter()->SetUseEvexEncoding(true); + // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added. + } } #endif // TARGET_XARCH } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9a255c6f975a97..585adda464ca29 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9197,6 +9197,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #ifdef TARGET_XARCH bool canUseVexEncoding() const { +#ifdef DEBUG + if (JitConfig.JitForceEVEXEncoding()) + { + return true; + } +#endif // DEBUG + return compOpportunisticallyDependsOn(InstructionSet_AVX); } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index e6bc3b68a56089..36ffe15229f65a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1109,7 +1109,7 @@ class emitter } #endif // TARGET_LOONGARCH64 - emitAttr idOpSize() + emitAttr idOpSize() const { return emitDecodeSize(_idOpSize); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e7f4b5d1cd5b3e..e239810c575ed8 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -161,7 +161,16 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) bool emitter::IsVexEncodedInstruction(instruction ins) const { - return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); + if (!UseVEXEncoding()) + { + return false; + } + + // There is also Translate_VEX; however, we expect codegen to have handled + // this difference and given us the VEX-exclusive version of the instruction + + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Encoding_VEX) != 0; } //------------------------------------------------------------------------ @@ -180,133 +189,11 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const return false; } - // TODO-XArch-AVX512: Explore adding this as a flag to instr table. - switch (ins) - { - // No EVEX Encoding exists at all.
- case INS_pmovmskb: - case INS_movmskpd: - case INS_movmskps: - case INS_dppd: - case INS_dpps: - case INS_maskmovdqu: - case INS_haddps: - case INS_haddpd: - case INS_hsubps: - case INS_hsubpd: - case INS_addsubps: - case INS_addsubpd: - case INS_rcpps: - case INS_rcpss: - case INS_rsqrtps: - case INS_rsqrtss: - case INS_psignb: - case INS_psignd: - case INS_psignw: - case INS_roundps: - case INS_roundss: - case INS_roundpd: - case INS_roundsd: - case INS_blendps: - case INS_blendpd: - case INS_blendvps: - case INS_pblendw: - case INS_pblendvb: - case INS_blendvpd: - case INS_ptest: - case INS_phaddw: - case INS_phsubw: - case INS_phaddd: - case INS_phsubd: - case INS_phaddsw: - case INS_phsubsw: - case INS_lddqu: - case INS_phminposuw: - case INS_mpsadbw: - case INS_pclmulqdq: - case INS_aesenc: - case INS_aesenclast: - case INS_aesdec: - case INS_aesdeclast: - case INS_aesimc: - case INS_aeskeygenassist: - case INS_vzeroupper: - case INS_vperm2i128: - case INS_vperm2f128: - case INS_vpblendd: - case INS_vblendvps: - case INS_vblendvpd: - case INS_vpblendvb: - case INS_vtestps: - case INS_vtestpd: - case INS_vmaskmovps: - case INS_vmaskmovpd: - case INS_vpmaskmovd: - case INS_vpmaskmovq: - case INS_andn: - case INS_blsi: - case INS_blsmsk: - case INS_blsr: - case INS_bextr: - case INS_rorx: - case INS_pdep: - case INS_pext: - case INS_bzhi: - case INS_mulx: -#ifdef TARGET_AMD64 - case INS_shlx: - case INS_sarx: - case INS_shrx: -#endif - case INS_lfence: - case INS_mfence: - case INS_movnti: - case INS_prefetchnta: - case INS_prefetcht0: - case INS_prefetcht1: - case INS_prefetcht2: - case INS_sfence: - // Might need new INS_*suffix* instructions for these. - case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. - case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. - - case INS_kmovb_msk: - case INS_kmovw_msk: - case INS_kmovd_msk: - case INS_kmovq_msk: - case INS_kmovb_gpr: - case INS_kmovw_gpr: - case INS_kmovd_gpr: - case INS_kmovq_gpr: - - // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, - // movdqu16 etc) - // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand - // type - // i.e., movdqu => movdqu32 etc - // Since we are not using k registers yet, this will have no impact on correctness but will affect things - // once - // k registers are used (as that is the point of the "break out operand type" of these instructions) - // case INS_movdqa: // INS_vmovdqa32, INS_vmovdqa64. - // case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_vmovdqu32, INS_vmovdqu64. - // case INS_pand: // INS_vpandd, INS_vpandq. - // case INS_pandn: // INS_vpandnd, INS_vpandnq. - // case INS_por: // INS_vpord, INS_vporq. - // case INS_pxor: // INS_vpxord, INS_vpxorq - // case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. - // case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. - // case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. - // case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. 
- { - return false; - } - default: - { - break; - } - } + // There is also Translate_EVEX; however, we expect codegen to have handled + // this difference and given us the EVEX-exclusive version of the instruction - return IsAvx512OrPriorInstruction(ins); + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Encoding_EVEX) != 0; } //------------------------------------------------------------------------ @@ -320,30 +207,34 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const // bool emitter::IsVexOrEvexEncodedInstruction(instruction ins) const { - return IsEvexEncodedInstruction(ins) || IsVexEncodedInstruction(ins); + return IsVexEncodedInstruction(ins) || IsEvexEncodedInstruction(ins); } // Returns true if the AVX instruction is a binary operator that requires 3 operands. // When we emit an instruction with only two operands, we will duplicate the destination // as a source. -// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to -// be formalized by adding an additional field to instruction table to -// to indicate whether a 3-operand instruction. -bool emitter::IsDstDstSrcAVXInstruction(instruction ins) +bool emitter::IsDstDstSrcAVXInstruction(instruction ins) const { - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && - IsVexOrEvexEncodedInstruction(ins); + if (!UseVEXEncoding()) + { + return false; + } + + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_IsDstDstSrcAVXInstruction) != 0; } // Returns true if the AVX instruction requires 3 operands that duplicate the source // register in the vvvv field. -// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to -// be formalized by adding an additional field to instruction table to -// to indicate whether a 3-operand instruction. -bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) +bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) const { - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && - IsVexOrEvexEncodedInstruction(ins); + if (!UseVEXEncoding()) + { + return false; + } + + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_IsDstSrcSrcAVXInstruction) != 0; } //------------------------------------------------------------------------ @@ -363,7 +254,8 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) // true if instruction has a regular form where the 'w' bit needs to be set. bool emitter::HasRegularWideForm(instruction ins) { - return ((CodeGenInterface::instInfo[ins] & INS_FLAGS_Has_Wbit) != 0); + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_FLAGS_Has_Wbit) != 0; } //------------------------------------------------------------------------ @@ -382,7 +274,8 @@ bool emitter::HasRegularWideForm(instruction ins) // true if instruction has a regular wide immediate form where the 's' bit needs to be set.
bool emitter::HasRegularWideImmediateForm(instruction ins) { - return ((CodeGenInterface::instInfo[ins] & INS_FLAGS_Has_Sbit) != 0); + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_FLAGS_Has_Sbit) != 0; } //------------------------------------------------------------------------ @@ -397,7 +290,8 @@ bool emitter::HasRegularWideImmediateForm(instruction ins) // bool emitter::DoesWriteZeroFlag(instruction ins) { - return (CodeGenInterface::instInfo[ins] & Writes_ZF) != 0; + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Writes_ZF) != 0; } //------------------------------------------------------------------------ @@ -412,7 +306,8 @@ bool emitter::DoesWriteZeroFlag(instruction ins) // bool emitter::DoesWriteSignFlag(instruction ins) { - return (CodeGenInterface::instInfo[ins] & Writes_SF) != 0; + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Writes_SF) != 0; } //------------------------------------------------------------------------ @@ -427,7 +322,8 @@ bool emitter::DoesWriteSignFlag(instruction ins) // bool emitter::DoesResetOverflowAndCarryFlags(instruction ins) { - return (CodeGenInterface::instInfo[ins] & (Resets_OF | Resets_CF)) == (Resets_OF | Resets_CF); + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & (Resets_OF | Resets_CF)) == (Resets_OF | Resets_CF); } //------------------------------------------------------------------------ @@ -507,7 +403,7 @@ bool emitter::IsRexW0Instruction(instruction ins) if ((flags & REX_W0) != 0) { - assert((flags & (REX_W1 | REX_WX)) == 0); + assert((flags & (REX_W1 | REX_WX | REX_W1_EVEX)) == 0); return true; } @@ -529,7 +425,7 @@ bool emitter::IsRexW1Instruction(instruction ins) if ((flags & REX_W1) != 0) { - assert((flags & (REX_W0 | REX_WX)) == 0); + assert((flags & (REX_W0 | REX_WX | REX_W1_EVEX)) == 0); return true; } @@ -551,7 +447,29 @@ bool emitter::IsRexWXInstruction(instruction ins) if ((flags & REX_WX) != 0) { - assert((flags & (REX_W0 | REX_W1)) == 0); + assert((flags & (REX_W0 | REX_W1 | REX_W1_EVEX)) == 0); + return true; + } + + return false; +} + +//------------------------------------------------------------------------ +// IsRexW1EvexInstruction: check if the instruction always encodes REX.W as 1 for EVEX +// +// Arguments: +// ins - instruction to test +// +// Return Value: +// true if the instruction always encodes REX.W as 1 for EVEX; otherwise, false +// +bool emitter::IsRexW1EvexInstruction(instruction ins) +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + + if ((flags & REX_W1_EVEX) != 0) + { + assert((flags & (REX_W0 | REX_W1 | REX_WX)) == 0); return true; } return false; } @@ -1090,8 +1008,7 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) const bool emitter::TakesSimdPrefix(const instrDesc* id) const { instruction ins = id->idIns(); - - return TakesEvexPrefix(id) || TakesVexPrefix(ins); + return TakesVexPrefix(ins) || TakesEvexPrefix(id); } //------------------------------------------------------------------------ @@ -1115,23 +1032,27 @@ bool emitter::TakesSimdPrefix(const instrDesc* id) const // bool emitter::TakesEvexPrefix(const instrDesc* id) const { - if (!emitComp->DoJitStressEvexEncoding()) + instruction ins = id->idIns(); + + if (!IsEvexEncodedInstruction(ins)) { return false; } - instruction ins = id->idIns(); + if (!emitComp->DoJitStressEvexEncoding()) + { + return false; + } if (HasHighSIMDReg(id)) { - assert(IsEvexEncodedInstruction(ins)); // TODO-XARCH-AVX512 remove this check once k registers have been implemented
assert(!HasKMaskRegisterDest(ins)); return true; } // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added. - return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins); + return !HasKMaskRegisterDest(ins); } // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section @@ -1281,112 +1202,106 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att } // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix -bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) +bool emitter::TakesRexWPrefix(const instrDesc* id) const { - // Because the current implementation of AVX does not have a way to distinguish between the register - // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are - // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), - // and here we must special case these by the opcode. - switch (ins) +#if defined(TARGET_X86) + if (!UseVEXEncoding()) { - case INS_vpermpd: - case INS_vpermq: - case INS_vpsrlvq: - case INS_vpsllvq: - case INS_pinsrq: - case INS_pextrq: - case INS_vfmadd132pd: - case INS_vfmadd213pd: - case INS_vfmadd231pd: - case INS_vfmadd132sd: - case INS_vfmadd213sd: - case INS_vfmadd231sd: - case INS_vfmaddsub132pd: - case INS_vfmaddsub213pd: - case INS_vfmaddsub231pd: - case INS_vfmsubadd132pd: - case INS_vfmsubadd213pd: - case INS_vfmsubadd231pd: - case INS_vfmsub132pd: - case INS_vfmsub213pd: - case INS_vfmsub231pd: - case INS_vfmsub132sd: - case INS_vfmsub213sd: - case INS_vfmsub231sd: - case INS_vfnmadd132pd: - case INS_vfnmadd213pd: - case INS_vfnmadd231pd: - case INS_vfnmadd132sd: - case INS_vfnmadd213sd: - case INS_vfnmadd231sd: - case INS_vfnmsub132pd: - case INS_vfnmsub213pd: - case INS_vfnmsub231pd: - case INS_vfnmsub132sd: - case INS_vfnmsub213sd: - case INS_vfnmsub231sd: - case INS_vpmaskmovq: - case INS_vpgatherdq: - case INS_vpgatherqq: - case INS_vgatherdpd: - case INS_vgatherqpd: - case INS_vpmovw2m: - case INS_vpmovq2m: - return true; - default: - break; + return false; } +#endif // TARGET_X86 -#ifdef TARGET_AMD64 - // movsx should always sign extend out to 8 bytes just because we don't track - // whether the dest should be 4 bytes or 8 bytes (attr indicates the size - // of the source, not the dest). - // A 4-byte movzx is equivalent to an 8 byte movzx, so it is not special - // cased here. - if (ins == INS_movsx) + instruction ins = id->idIns(); + emitAttr attr = id->idOpSize(); + + if (IsRexW0Instruction(ins)) + { + return false; + } + else if (IsRexW1Instruction(ins)) { return true; } - - if (EA_SIZE(attr) != EA_8BYTE) + else if (IsRexW1EvexInstruction(ins)) { - return false; + return TakesEvexPrefix(id); } - if (IsSSEOrAVXInstruction(ins)) + if (IsRexWXInstruction(ins)) { switch (ins) { - case INS_movd: // TODO-Cleanup: replace with movq, https://github.com/dotnet/runtime/issues/47943. 
+ case INS_cvtss2si: + case INS_cvttss2si: + case INS_cvtsd2si: + case INS_cvttsd2si: + case INS_movd: + case INS_movnti: case INS_andn: case INS_bextr: case INS_blsi: case INS_blsmsk: case INS_blsr: case INS_bzhi: - case INS_cvttsd2si: - case INS_cvttss2si: - case INS_cvtsd2si: - case INS_cvtss2si: - case INS_cvtsi2sd64: - case INS_cvtsi2ss64: - case INS_movnti: case INS_mulx: case INS_pdep: case INS_pext: case INS_rorx: - case INS_shlx: +#if defined(TARGET_AMD64) case INS_sarx: + case INS_shlx: case INS_shrx: - case INS_kmovq_msk: - case INS_kmovq_gpr: - case INS_kmovd_msk: - return true; - default: +#endif // TARGET_AMD64 + { + if (attr == EA_8BYTE) + { + return true; + } + + // TODO-Cleanup: This should really only ever be EA_4BYTE + assert((attr == EA_4BYTE) || (attr == EA_16BYTE)); return false; + } + + case INS_vbroadcastsd: + case INS_vpbroadcastq: + { + // TODO-XARCH-AVX512: These use W1 if a kmask is involved + return TakesEvexPrefix(id); + } + + case INS_vpermilpd: + case INS_vpermilpdvar: + { + // TODO-XARCH-AVX512: These use W1 if a kmask or broadcast from memory is involved + return TakesEvexPrefix(id); + } + + default: + { + unreached(); + } } } + assert(!IsAvx512OrPriorInstruction(ins)); + #ifdef TARGET_AMD64 // movsx should always sign extend out to 8 bytes just because we don't track // whether the dest should be 4 bytes or 8 bytes (attr indicates the size // of the source, not the dest). // A 4-byte movzx is equivalent to an 8 byte movzx, so it is not special // cased here. if (ins == INS_movsx) { return true; } if (EA_SIZE(attr) != EA_8BYTE) { return false; } // TODO-XArch-Cleanup: Better way to not emit REX.W when we don't need it, than just testing all these // opcodes... // These are all the instructions that default to 8-byte operand without the REX.W bit @@ -2818,7 +2733,7 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const emitAttr size = id->idOpSize(); - if (TakesRexWPrefix(ins, size)) + if (TakesRexWPrefix(id)) { // When the REX.W bit is present, we must use the 3-byte encoding return 3; @@ -3493,7 +3408,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) bool includeRexPrefixSize = true; // REX prefix - if (TakesRexWPrefix(ins, attr) || IsExtendedReg(id->idReg1(), attr) || IsExtendedReg(id->idReg2(), attr) || + if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attr) || IsExtendedReg(id->idReg2(), attr) || (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr)))) { sz += emitGetRexPrefixSize(ins); @@ -3581,7 +3496,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) emitAttr attr = id->idOpSize(); emitAttr size = EA_SIZE(attr); - if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || + if ((TakesRexWPrefix(id) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || IsExtendedReg(reg2, attr)) { sz += emitGetRexPrefixSize(ins); } @@ -3800,8 +3715,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var UNATIVE_OFFSET prefix = emitGetAdjustedSize(id, code); // REX prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) + if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { prefix += emitGetRexPrefixSize(ins); } @@ -3845,8 +3759,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var } //
64-bit operand instructions will need a REX.W prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) + if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { prefix += emitGetRexPrefixSize(ins); } @@ -3955,7 +3868,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) // REX prefix size += emitGetRexPrefixSize(ins); } - else if (TakesRexWPrefix(ins, attrSize)) + else if (TakesRexWPrefix(id)) { // REX.W prefix size += emitGetRexPrefixSize(ins); @@ -4171,8 +4084,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code) bool includeRexPrefixSize = true; // 64-bit operand instructions will need a REX.W prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) + if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { size += emitGetRexPrefixSize(ins); includeRexPrefixSize = false; @@ -4412,7 +4324,7 @@ void emitter::emitIns(instruction ins, emitAttr attr) id->idInsFmt(fmt); sz += emitGetAdjustedSize(id, code); - if (TakesRexWPrefix(ins, attr)) + if (TakesRexWPrefix(id)) { sz += emitGetRexPrefixSize(ins); } @@ -5554,7 +5466,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); // REX byte - if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) + if (IsExtendedReg(reg, attr) || TakesRexWPrefix(id)) { sz += emitGetRexPrefixSize(ins); } @@ -5706,7 +5618,7 @@ void emitter::emitIns_R_I(instruction ins, // Do not get the RexSize() but just decide if it will be included down further and if yes, // do not include it again. - if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, size) || instrIsExtendedReg3opImul(ins)) + if (IsExtendedReg(reg, attr) || TakesRexWPrefix(id) || instrIsExtendedReg3opImul(ins)) { includeRexPrefixSize = false; } @@ -5719,7 +5631,7 @@ void emitter::emitIns_R_I(instruction ins, // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target // register. So we also need to check if that built-in register is an extended register. 
- if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, size) || instrIsExtendedReg3opImul(ins)) + if (IsExtendedReg(reg, attr) || TakesRexWPrefix(id) || instrIsExtendedReg3opImul(ins)) { sz += emitGetRexPrefixSize(ins); } @@ -5863,7 +5775,7 @@ void emitter::emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fld sz = emitInsSizeCV(id, insCodeMR(ins)); } - if (TakesRexWPrefix(ins, attr)) + if (TakesRexWPrefix(id)) { // REX.W prefix sz += emitGetRexPrefixSize(ins); @@ -5934,14 +5846,12 @@ bool emitter::IsMovInstruction(instruction ins) case INS_movaps: case INS_movd: case INS_movdqa: - case INS_vmovdqa32: case INS_vmovdqa64: case INS_movdqu: - case INS_movdqu8: - case INS_movdqu16: - case INS_vmovdqu32: + case INS_vmovdqu8: + case INS_vmovdqu16: case INS_vmovdqu64: - case INS_movsdsse2: + case INS_movsd_simd: case INS_movss: case INS_movsx: case INS_movupd: @@ -6059,7 +5969,7 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } - case INS_movsdsse2: + case INS_movsd_simd: case INS_movss: { // Clears the upper bits under VEX encoding @@ -6083,11 +5993,9 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) break; } - case INS_vmovdqa32: case INS_vmovdqa64: - case INS_movdqu8: - case INS_movdqu16: - case INS_vmovdqu32: + case INS_vmovdqu8: + case INS_vmovdqu16: case INS_vmovdqu64: { // These EVEX instructions merges/masks based on k-register @@ -6299,14 +6207,12 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN case INS_movapd: case INS_movaps: case INS_movdqa: - case INS_vmovdqa32: case INS_vmovdqa64: case INS_movdqu: - case INS_movdqu8: - case INS_movdqu16: - case INS_vmovdqu32: + case INS_vmovdqu8: + case INS_vmovdqu16: case INS_vmovdqu64: - case INS_movsdsse2: + case INS_movsd_simd: case INS_movss: case INS_movupd: case INS_movups: @@ -7320,7 +7226,7 @@ void emitter::emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE f sz += 1; // REX prefix - if (TakesRexWPrefix(ins, attr) || IsExtendedReg(reg, attr)) + if (TakesRexWPrefix(id) || IsExtendedReg(reg, attr)) { sz += emitGetRexPrefixSize(ins); } @@ -12021,10 +11927,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Emit the REX prefix if required - // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently - // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -12857,11 +12760,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). - // Not doing so currently since we cannot differentiate EVEX vs VEX without - // 'code' until all paths have EVEX support. 
- if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -13306,7 +13205,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -13769,7 +13668,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -13802,7 +13701,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insEncodeOpreg(id, reg, size); assert(!TakesSimdPrefix(id)); - assert(!TakesRexWPrefix(ins, size)); + assert(!TakesRexWPrefix(id)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13821,7 +13720,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -13898,7 +13797,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = AddSimdPrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -14001,9 +13900,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) code = AddSimdPrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); - // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -14392,9 +14289,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code = insEncodeRMreg(id, code); - // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -14540,7 +14435,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -14873,7 +14768,7 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) } else { - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -15512,8 +15407,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // Support only scalar AVX instructions and hence size is hard coded to 4-byte. 
code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); - if (((ins == INS_cdq) || (ins == INS_cwde)) && - (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))) + if (((ins == INS_cdq) || (ins == INS_cwde)) && TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -15799,7 +15693,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Emit the REX prefix if it exists - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -15888,7 +15782,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } assert(code & 0x00FF0000); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); } @@ -17538,12 +17432,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_movdqa: - case INS_vmovdqa32: case INS_vmovdqa64: case INS_movdqu: - case INS_movdqu8: - case INS_movdqu16: - case INS_vmovdqu32: + case INS_vmovdqu8: + case INS_vmovdqu16: case INS_vmovdqu64: case INS_movaps: case INS_movups: @@ -17618,7 +17510,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_movss: - case INS_movsdsse2: + case INS_movsd_simd: case INS_movddup: if (memAccessKind == PERFSCORE_MEMORY_NONE) { @@ -17757,16 +17649,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_paddusw: case INS_psubusw: case INS_pand: - case INS_vpandd: case INS_vpandq: case INS_pandn: - case INS_vpandnd: case INS_vpandnq: case INS_por: - case INS_vpord: case INS_vporq: case INS_pxor: - case INS_vpxord: case INS_vpxorq: case INS_andpd: case INS_andps: diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 76197ee1ca8253..2f6e89ed800649 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -156,7 +156,7 @@ bool hasRexPrefix(code_t code) #define VEX_PREFIX_CODE_3BYTE 0xC4000000000000ULL bool TakesVexPrefix(instruction ins) const; -static bool TakesRexWPrefix(instruction ins, emitAttr attr); +bool TakesRexWPrefix(const instrDesc* id) const; // Returns true if the instruction encoding already contains VEX prefix bool hasVexPrefix(code_t code) @@ -181,260 +181,6 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } -//------------------------------------------------------------------------ -// IsWEvexOpcodeExtension: Some instructions use W bit as an opcode extension bit. -// Identify instructions which requires W bit to be set to 1 -// for Evex encoding. -// TODO-XArch-AVX512: Explore adding this as a flag to instr table. -// -// Arguments: -// ins - The instruction to check. -// -// Returns: -// `true` if W bit needs to be set to 1. 
-// -bool IsWEvexOpcodeExtension(const instrDesc* id) -{ - if (!TakesEvexPrefix(id)) - { - return false; - } - - instruction ins = id->idIns(); - - if (IsRexW0Instruction(ins)) - { - return false; - } - else if (IsRexW1Instruction(ins)) - { - return true; - } - - if (IsRexWXInstruction(ins)) - { - // TODO: Make this a simple assert once all instructions are annotated - unreached(); - } - - switch (ins) - { - case INS_movq: - case INS_addpd: - case INS_addsd: - case INS_movsd: - case INS_movsdsse2: - case INS_mulsd: - case INS_mulpd: - case INS_movntpd: - case INS_movlpd: - case INS_movhpd: - case INS_movapd: - case INS_movupd: - case INS_shufpd: - case INS_subsd: - case INS_subpd: - case INS_minsd: - case INS_minpd: - case INS_divsd: - case INS_divpd: - case INS_maxsd: - case INS_maxpd: - case INS_xorpd: - case INS_andpd: - case INS_sqrtsd: - case INS_sqrtpd: - case INS_andnpd: - case INS_orpd: - case INS_cvtpd2ps: - case INS_cvtsd2ss: - case INS_cvtpd2dq: - case INS_cvttpd2dq: - case INS_comisd: - case INS_ucomisd: - case INS_paddq: - case INS_psubq: - case INS_pmuludq: - case INS_psllq: - case INS_psrlq: - case INS_punpckhqdq: - case INS_punpcklqdq: - case INS_unpckhpd: - case INS_pmuldq: - case INS_movddup: - case INS_pinsrq: - case INS_pextrq: - case INS_vpbroadcastq: - case INS_vpermq: - case INS_vpsrlvq: - case INS_vpsllvq: - case INS_vpermilpd: - case INS_vpermpd: - case INS_vpgatherdq: - case INS_vpgatherqq: - case INS_vgatherdpd: - case INS_vgatherqpd: - case INS_vfmadd132pd: - case INS_vfmadd213pd: - case INS_vfmadd231pd: - case INS_vfmadd132sd: - case INS_vfmadd213sd: - case INS_vfmadd231sd: - case INS_vfmaddsub132pd: - case INS_vfmaddsub213pd: - case INS_vfmaddsub231pd: - case INS_vfmsubadd132pd: - case INS_vfmsubadd213pd: - case INS_vfmsubadd231pd: - case INS_vfmsub132pd: - case INS_vfmsub213pd: - case INS_vfmsub231pd: - case INS_vfmsub132sd: - case INS_vfmsub213sd: - case INS_vfmsub231sd: - case INS_vfnmadd132pd: - case INS_vfnmadd213pd: - case INS_vfnmadd231pd: - case INS_vfnmadd132sd: - case INS_vfnmadd213sd: - case INS_vfnmadd231sd: - case INS_vfnmsub132pd: - case INS_vfnmsub213pd: - case INS_vfnmsub231pd: - case INS_vfnmsub132sd: - case INS_vfnmsub213sd: - case INS_vfnmsub231sd: - case INS_unpcklpd: - case INS_vpermilpdvar: - case INS_movdqu16: - case INS_vinsertf64x4: - case INS_vinserti64x4: - { - return true; // W1 - } - case INS_movd: - case INS_punpckldq: - case INS_movntdq: - case INS_movntps: - case INS_movlps: - case INS_movhps: - case INS_movss: - case INS_movaps: - case INS_movups: - case INS_movhlps: - case INS_movlhps: - case INS_unpckhps: - case INS_unpcklps: - case INS_shufps: - case INS_punpckhdq: - case INS_addps: - case INS_addss: - case INS_mulss: - case INS_mulps: - case INS_subss: - case INS_subps: - case INS_minss: - case INS_minps: - case INS_divss: - case INS_divps: - case INS_maxss: - case INS_maxps: - case INS_xorps: - case INS_andps: - case INS_sqrtss: - case INS_sqrtps: - case INS_andnps: - case INS_orps: - case INS_cvtss2sd: - case INS_cvtdq2ps: - case INS_cvtps2dq: - case INS_cvttps2dq: - case INS_cvtdq2pd: - case INS_comiss: - case INS_ucomiss: - case INS_paddd: - case INS_psubd: - case INS_pslld: - case INS_psrld: - case INS_psrad: - case INS_pshufd: - case INS_packssdw: - case INS_insertps: - case INS_pmulld: - case INS_pabsd: - case INS_pminsd: - case INS_pminud: - case INS_pmaxud: - case INS_pmovsxdq: - case INS_pmovzxdq: - case INS_packusdw: - case INS_movntdqa: - case INS_movsldup: - case INS_movshdup: - case INS_pinsrd: - case 
INS_pextrd: - case INS_vbroadcastss: - case INS_vbroadcastsd: - case INS_vpbroadcastb: - case INS_vpbroadcastw: - case INS_vpbroadcastd: - case INS_vpsravd: - case INS_vpsllvd: - case INS_vpermilps: - case INS_vpermd: - case INS_vpermps: - case INS_vpgatherdd: - case INS_vpgatherqd: - case INS_vgatherdps: - case INS_vgatherqps: - case INS_vfmadd132ps: - case INS_vfmadd213ps: - case INS_vfmadd231ps: - case INS_vfmadd132ss: - case INS_vfmadd213ss: - case INS_vfmadd231ss: - case INS_vfmaddsub132ps: - case INS_vfmaddsub213ps: - case INS_vfmaddsub231ps: - case INS_vfmsubadd132ps: - case INS_vfmsubadd213ps: - case INS_vfmsubadd231ps: - case INS_vfmsub132ps: - case INS_vfmsub213ps: - case INS_vfmsub231ps: - case INS_vfmsub132ss: - case INS_vfmsub213ss: - case INS_vfmsub231ss: - case INS_vfnmadd132ps: - case INS_vfnmadd213ps: - case INS_vfnmadd231ps: - case INS_vfnmadd132ss: - case INS_vfnmadd213ss: - case INS_vfnmadd231ss: - case INS_vfnmsub132ps: - case INS_vfnmsub213ps: - case INS_vfnmsub231ps: - case INS_vfnmsub132ss: - case INS_vfnmsub213ss: - case INS_vfnmsub231ss: - case INS_vpdpbusd: - case INS_vpdpwssd: - case INS_vpdpbusds: - case INS_vpdpwssds: - case INS_vpermilpsvar: - case INS_movdqu8: - case INS_vinsertf32x8: - case INS_vinserti32x8: - { - return false; // W0 - } - default: - { - return false; // WIG - } - } -} - //------------------------------------------------------------------------ // HasKMaskRegisterDest: Temporary check to identify instructions that can // be Evex encoded but require Opmask(KMask) register support. @@ -502,6 +248,8 @@ bool UseEvexEncoding() const } void SetUseEvexEncoding(bool value) { + // We expect UseVEXEncoding to be true if UseEvexEncoding is true + assert(!value || UseVEXEncoding()); useEvexEncodings = value; } @@ -642,7 +390,7 @@ void SetContainsAVX(bool value) } bool contains256bitOrMoreAVXInstruction = false; -bool Contains256bitOrMoreAVX() +bool Contains256bitOrMoreAVX() const { return contains256bitOrMoreAVXInstruction; } @@ -651,17 +399,18 @@ void SetContains256bitOrMoreAVX(bool value) contains256bitOrMoreAVXInstruction = value; } -bool IsDstDstSrcAVXInstruction(instruction ins); -bool IsDstSrcSrcAVXInstruction(instruction ins); -bool HasRegularWideForm(instruction ins); -bool HasRegularWideImmediateForm(instruction ins); +bool IsDstDstSrcAVXInstruction(instruction ins) const; +bool IsDstSrcSrcAVXInstruction(instruction ins) const; +static bool HasRegularWideForm(instruction ins); +static bool HasRegularWideImmediateForm(instruction ins); static bool DoesWriteZeroFlag(instruction ins); -bool DoesWriteSignFlag(instruction ins); -bool DoesResetOverflowAndCarryFlags(instruction ins); +static bool DoesWriteSignFlag(instruction ins); +static bool DoesResetOverflowAndCarryFlags(instruction ins); bool IsFlagsAlwaysModified(instrDesc* id); static bool IsRexW0Instruction(instruction ins); static bool IsRexW1Instruction(instruction ins); static bool IsRexWXInstruction(instruction ins); +static bool IsRexW1EvexInstruction(instruction ins); bool IsThreeOperandAVXInstruction(instruction ins) { diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 72603878059dea..6dbcf3f0f41ccf 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -44,7 +44,7 @@ HARDWARE_INTRINSIC(Vector128, AsUInt16, HARDWARE_INTRINSIC(Vector128, AsUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, AsUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, AsVector, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, AsVector2, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector128, AsVector2, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsd_simd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, AsVector128, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -60,7 +60,7 @@ HARDWARE_INTRINSIC(Vector128, ConvertToUInt32, HARDWARE_INTRINSIC(Vector128, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, CreateScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, Divide, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) @@ -120,7 +120,7 @@ HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -163,7 +163,7 @@ HARDWARE_INTRINSIC(Vector256, ConvertToUInt32, HARDWARE_INTRINSIC(Vector256, ConvertToUInt64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, CreateScalar, 32, -1, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Equals, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) @@ -224,8 +224,8 @@ HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu8, INS_movdqu8, INS_movdqu16, INS_movdqu16, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_vmovdqu8, INS_vmovdqu8, INS_vmovdqu16, INS_vmovdqu16, INS_movdqu, INS_movdqu, 
INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) @@ -241,7 +241,7 @@ HARDWARE_INTRINSIC(Vector512, BitwiseAnd, HARDWARE_INTRINSIC(Vector512, BitwiseOr, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -445,7 +445,7 @@ HARDWARE_INTRINSIC(SSE2, LoadAlignedVector128, HARDWARE_INTRINSIC(SSE2, LoadFence, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, LoadHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, LoadLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, LoadScalarVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2, LoadScalarVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsd_simd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, LoadVector128, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(SSE2, MaskMove, 16, 3, {INS_maskmovdqu, INS_maskmovdqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2, Max, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) @@ -454,7 +454,7 @@ HARDWARE_INTRINSIC(SSE2, MaxScalar, HARDWARE_INTRINSIC(SSE2, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) HARDWARE_INTRINSIC(SSE2, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE2, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_invalid, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -480,7 +480,7 @@ HARDWARE_INTRINSIC(SSE2, StoreAlignedNonTemporal, HARDWARE_INTRINSIC(SSE2, StoreHigh, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2, StoreLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2, StoreNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) -HARDWARE_INTRINSIC(SSE2, StoreScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(SSE2, StoreScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsd_simd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2, Subtract, 16, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, SubtractSaturate, 16, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, SubtractScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -750,18 +750,18 @@ HARDWARE_INTRINSIC(AVX2, Xor, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F Intrinsics -HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_vpandd, INS_vpandd, INS_vpandd, INS_vpandd, INS_vpandd, INS_vpandd, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) 
 HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX512F, LoadVector512, 64, 1, {INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX512F, Store, 64, 2, {INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX512F, LoadVector512, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F, Store, 64, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX512F, StoreAlignedNonTemporal, 64, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
-HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, {INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 // ************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 8da608dd356467..f0e4c5596b927c 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -101,13 +101,88 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id)
     static char buf[4][TEMP_BUFFER_LEN];
     const char* retbuf;
 
-    if (GetEmitter()->IsVexOrEvexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) &&
-        !GetEmitter()->IsKInstruction(ins))
+    const emitter* emit = GetEmitter();
+
+    if (emit->IsVexOrEvexEncodedInstruction(ins))
     {
-        sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName);
-        retbuf = buf[curBuf];
-        curBuf = (curBuf + 1) % 4;
-        return retbuf;
+        if (!emit->IsBMIInstruction(ins) && !emit->IsKInstruction(ins))
+        {
+            if (emit->TakesEvexPrefix(id))
+            {
+                switch (ins)
+                {
+                    case INS_movdqa:
+                    {
+                        return "vmovdqa32";
+                    }
+
+                    case INS_movdqu:
+                    {
+                        return "vmovdqu32";
+                    }
+
+                    case INS_pand:
+                    {
+                        return "vpandd";
+                    }
+
+                    case INS_pandn:
+                    {
+                        return "vpandnd";
+                    }
+
+                    case INS_por:
+                    {
+                        return "vpord";
+                    }
+
+                    case INS_pxor:
+                    {
+                        return "vpxord";
+                    }
+
+                    case INS_vbroadcastf128:
+                    {
+                        return "vbroadcastf32x4";
+                    }
+
+                    case INS_vextractf128:
+                    {
+                        return "vextractf32x4";
+                    }
+
+                    case INS_vinsertf128:
+                    {
+                        return "vinsertf32x4";
+                    }
+
+                    case INS_vbroadcasti128:
+                    {
+                        return "vbroadcasti32x4";
+                    }
+
+                    case INS_vextracti128:
+                    {
+                        return "vextracti32x4";
+                    }
+
+                    case INS_vinserti128:
+                    {
+                        return "vinserti32x4";
+                    }
+
+                    default:
+                    {
+                        break;
+                    }
+                }
+            }
+
+            sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName);
+            retbuf = buf[curBuf];
+            curBuf = (curBuf + 1) % 4;
+            return retbuf;
+        }
     }
 
     // Some instructions have different mnemonics depending on the size.
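The switch above is the display-side counterpart of the table change in hwintrinsiclistxarch.h: the instruction tables now carry the legacy mnemonic (pand, movdqu, ...) and, when the EVEX prefix is in use, the disassembly prints the 32-bit-element EVEX name the JIT currently defaults to. A minimal, self-contained C++ sketch of that mapping follows; EvexDisplayName is a hypothetical helper for illustration, not a JIT API.

```cpp
// Illustrative sketch of the legacy -> EVEX display-name mapping that
// genInsDisplayName performs above. Hypothetical helper, not JIT code.
#include <cstdio>
#include <cstring>

static const char* EvexDisplayName(const char* legacyName)
{
    // The JIT currently defaults these to their 32-bit element EVEX forms.
    if (strcmp(legacyName, "movdqa") == 0) return "vmovdqa32";
    if (strcmp(legacyName, "movdqu") == 0) return "vmovdqu32";
    if (strcmp(legacyName, "pand") == 0)   return "vpandd";
    if (strcmp(legacyName, "pandn") == 0)  return "vpandnd";
    if (strcmp(legacyName, "por") == 0)    return "vpord";
    if (strcmp(legacyName, "pxor") == 0)   return "vpxord";
    return legacyName; // other instructions only gain the "v" prefix
}

int main()
{
    printf("%s\n", EvexDisplayName("pxor")); // prints "vpxord"
    return 0;
}
```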
@@ -1264,7 +1339,7 @@ instruction CodeGen::ins_Move_Extend(var_types srcType, bool srcInReg)
     {
         if (srcType == TYP_DOUBLE)
         {
-            return (srcInReg) ? INS_movaps : INS_movsdsse2;
+            return (srcInReg) ? INS_movaps : INS_movsd_simd;
         }
         else if (srcType == TYP_FLOAT)
         {
@@ -1401,7 +1476,7 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*
 #ifdef FEATURE_SIMD
         if (srcType == TYP_SIMD8)
         {
-            return INS_movsdsse2;
+            return INS_movsd_simd;
         }
         else
 #endif // FEATURE_SIMD
@@ -1423,7 +1498,7 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*
 #if defined(TARGET_XARCH)
         if (srcType == TYP_DOUBLE)
         {
-            return INS_movsdsse2;
+            return INS_movsd_simd;
         }
         else if (srcType == TYP_FLOAT)
         {
@@ -1659,7 +1734,7 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false
 #ifdef FEATURE_SIMD
         if (dstType == TYP_SIMD8)
         {
-            return INS_movsdsse2;
+            return INS_movsd_simd;
         }
         else
 #endif // FEATURE_SIMD
@@ -1674,7 +1749,7 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false
     {
         if (dstType == TYP_DOUBLE)
         {
-            return INS_movsdsse2;
+            return INS_movsd_simd;
         }
         else if (dstType == TYP_FLOAT)
         {
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index 67f55f3057a63e..1ef608460d6831 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -119,6 +119,7 @@ enum insFlags : uint64_t
     // Resets
     Resets_OF = 1ULL << 12,
     Resets_SF = 1ULL << 13,
+    Resets_ZF = 1ULL << 39,
     Resets_AF = 1ULL << 14,
     Resets_PF = 1ULL << 15,
     Resets_CF = 1ULL << 16,
@@ -158,7 +159,20 @@ enum insFlags : uint64_t
     REX_W0 = 1ULL << 33,
     REX_W1 = 1ULL << 34,
     REX_WX = 1ULL << 35,
-    REX_WIG = REX_W0,
+
+    // encoding of the REX.W-bit is considered for EVEX only and W0 or WIG otherwise
+    REX_W0_EVEX = REX_W0,
+    REX_W1_EVEX = 1ULL << 36,
+
+    // encoding of the REX.W-bit is ignored
+    REX_WIG = REX_W0,
+
+    // whether VEX or EVEX encodings are directly supported
+    Encoding_VEX = 1ULL << 37,
+    Encoding_EVEX = 1ULL << 38,
+
+    // Listed above so it is "inline" with the other Resets_* flags
+    // Resets_ZF = 1ULL << 39,
 
     // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH
     INS_FLAGS_DONT_CARE = 0x00ULL,
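With Encoding_VEX and Encoding_EVEX carried in the flags, the encoding queries (compare IsVexEncodedInstruction earlier in this diff) become table-driven instead of a hand-maintained switch. Below is a simplified, compilable model of that test; the enum values are copied from the additions above, but the types and helper are stand-ins, not the real insFlags plumbing.

```cpp
// Simplified model of the table-driven encoding test. Stand-in types: the
// real code reads CodeGenInterface::instInfo[ins] plus emitter state.
#include <cstdint>
#include <cassert>

enum insFlags : uint64_t
{
    Encoding_VEX  = 1ULL << 37, // a VEX form of the instruction exists
    Encoding_EVEX = 1ULL << 38, // an EVEX form of the instruction exists
};

static bool HasEvexForm(uint64_t flags)
{
    return (flags & Encoding_EVEX) != 0;
}

int main()
{
    uint64_t pandFlags = Encoding_VEX | Encoding_EVEX; // hypothetical table entry
    assert(HasEvexForm(pandFlags));
    return 0;
}
```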
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index e01cd4bfd303f6..150c76d49033e4 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -52,72 +52,72 @@
 // id nm um mr mi rm a4 rr tt flags
 INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None )
-INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None )
+INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None)
+INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None)
 // Does not affect the stack tracking in the emitter
-INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None )
-INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None )
+INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None)
+INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None)
-INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit )
-INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF )
-INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit )
-INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF )
+INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit)
+INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF)
+INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit)
+INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF)
 // Multi-byte opcodes without modrm are represented in mixed endian fashion.
 // See comment around quarter way through this file for more information.
-INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None )
+INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None)
 // id nm um mr mi rm a4 tt flags
-INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
 // Does not affect the stack tracking in the emitter
-INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
+INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
-INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit )
-INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit )
-INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit )
+INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit)
+INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit)
+INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit)
-INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_TT_NONE, INS_FLAGS_None )
+INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 // id nm um mr mi rm tt flags
 // Note that emitter has only partial support for BT. It can only emit the reg,reg form
 // and the registers need to be reversed to get the correct encoding.
-INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF )
+INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF)
-INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF )
-INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF )
+INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF)
+INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF)
-INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_TT_NONE, INS_FLAGS_Has_Wbit )
+INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_TT_NONE, INS_FLAGS_Has_Wbit)
 #ifdef TARGET_AMD64
-INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_TT_NONE, INS_FLAGS_Has_Wbit )
+INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_TT_NONE, INS_FLAGS_Has_Wbit)
 #endif
-INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit )
-
-INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF )
-INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF )
-INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF )
-INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF )
-INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF )
-INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF )
-INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF )
-INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF )
-INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF )
-INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF )
-INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF )
-INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF )
-INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-
-INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit )
-INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
+INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit)
+
+INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF)
+INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF)
+INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF)
+INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF)
"cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) + +INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) // id nm um mr mi rm tt flags @@ -125,25 +125,25 @@ INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, // as 2-operand instructions with the target register being implicit // implicit_reg = op1*op2_icon #define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit ) +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, 
+INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
 #ifdef TARGET_AMD64
-INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
-INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit )
+INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
 #endif // TARGET_AMD64
@@ -170,7 +170,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868,
 #define SSEFLT(c) PACK3(0xf3, 0x0f, c)
 #define SSEDBL(c) PACK3(0xf2, 0x0f, c)
 #define PCKDBL(c) PACK3(0x66, 0x0f, c)
-#define PCKFLT(c) PACK2(0x0f,c)
+#define PCKFLT(c) PACK2(0x0f, c)
 #define PCKMVB(c) PACK3(0x0F, 0x38, c)
 // These macros encode extra byte that is implicit in the macro.
@@ -185,499 +185,460 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868,
 #define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2)
 INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-// These are the SSE instructions used on x86
-INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_TT_NONE, INS_FLAGS_None) // Move the MSB bits of all bytes in a xmm reg to an int reg
-INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_TT_NONE, INS_FLAGS_None) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros.
-INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_TT_TUPLE1_SCALAR, INS_FLAGS_None) // Move Double/Quadword between mm regs <-> memory/r32/r64 regs, cleanup https://github.com/dotnet/runtime/issues/47943
-INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Move Quadword between memory/mm <-> regs, cleanup https://github.com/dotnet/runtime/issues/47943
-INST3(movsdsse2, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-
-INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles
-
-INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit) // cvt with trunc scalar double to signed DWORDs
-
-INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit)
-INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit)
-INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit)
-INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqu32
-INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqa32
-INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_TT_TUPLE2, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction)
-INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_TT_FULL_MEM, Input_64Bit)
-INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_TT_FULL_MEM, Input_32Bit)
-INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_TT_FULL_MEM, Input_64Bit)
-INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_TT_FULL_MEM, Input_32Bit)
-INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_TT_NONE, INS_FLAGS_None)
-INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_TT_NONE, INS_FLAGS_None)
-
-INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-
-INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
-// SSE 2 arith
-INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles
-INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles
-INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles
-INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles
-INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles
-INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single
-INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles
-INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles
-INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles
-INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5C), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles
-INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles
-INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar doubles
-INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles
-INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single
-INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles
-INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double
-INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles
-INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles
-INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles
-INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles
-INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles
-INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single
-INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles
-INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double
-INST3(xorpd, "xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles
-INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles
-INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles
-INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Sqrt of packed singles
-INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single
-INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Sqrt of packed doubles
-INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double
-INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles
-INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles
-INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles
-INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles
-INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles
-INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats
-INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles
-INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats
-INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles
-INST3(addsubpd, "addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles
-
-// SSE 2 approx arith
-INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Reciprocal of packed singles
-INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single
-INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Reciprocal Sqrt of packed singles
-INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single
-
-// SSE2 conversions
-INST3(cvtpi2ps, "cvtpi2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2A), INS_TT_NONE, Input_32Bit) // cvt packed DWORDs to singles
-INST3(cvtsi2ss32, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single
-INST3(cvtsi2ss64, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar single
-INST3(cvtpi2pd, "cvtpi2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2A), INS_TT_NONE, Input_32Bit) // cvt packed DWORDs to doubles
-INST3(cvtsi2sd32, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double
-INST3(cvtsi2sd64, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar double
-INST3(cvttps2pi, "cvttps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2C), INS_TT_NONE, Input_32Bit) // cvt with trunc packed singles to DWORDs
-INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit) // cvt with trunc scalar single to DWORD
-INST3(cvttpd2pi, "cvttpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2C), INS_TT_NONE, Input_64Bit) // cvt with trunc packed doubles to DWORDs
-INST3(cvtps2pi, "cvtps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2D), INS_TT_NONE, Input_32Bit) // cvt packed singles to DWORDs
-INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_TT_TUPLE1_FIXED, Input_32Bit) // cvt scalar single to DWORD
-INST3(cvtpd2pi, "cvtpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2D), INS_TT_NONE, Input_64Bit) // cvt packed doubles to DWORDs
-INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_TT_TUPLE1_FIXED, Input_64Bit) // cvt scalar double to DWORD
-INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_TT_HALF, Input_32Bit) // cvt packed singles to doubles
-INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_TT_FULL, Input_64Bit) // cvt packed doubles to singles
-INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles
-INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles
-INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_32Bit) // cvt packed DWORDs to singles
-INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_TT_FULL, Input_32Bit) // cvt packed singles to DWORDs
-INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_TT_FULL, Input_32Bit) // cvt with trunc packed singles to DWORDs
-INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_TT_FULL, Input_64Bit) // cvt packed doubles to DWORDs
-INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_TT_FULL, Input_64Bit) // cvt with trunc packed doubles to DWORDs
-INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_HALF, Input_32Bit) // cvt packed DWORDs to doubles
-
-// SSE2 comparison instructions
-INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_TT_TUPLE1_SCALAR, Input_32Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // ordered compare singles
-INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_TT_TUPLE1_SCALAR, Input_64Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // ordered compare doubles
-INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), INS_TT_TUPLE1_SCALAR, Input_32Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare singles
-INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), INS_TT_TUPLE1_SCALAR, Input_64Bit | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF ) // unordered compare doubles
-
-// SSE2 packed single/double comparison operations.
-// Note that these instructions not only compare but also overwrite the first source.
-INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles
-INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles
-INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles
-INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles
-
-//SSE2 packed integer operations
-INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers
-INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers
-INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers
-INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers
-INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results
-INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results
-INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results
-INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results
-INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers
-INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers
-INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed byte integers
-INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers
-INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers
-INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_TT_FULL_MEM, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers
-INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst
-INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers
-INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers
-INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result
-INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result
-INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
-INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
-INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
-INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
-INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers
-INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation
-INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation
-INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation
-INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation
-
-// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode,
-// which is handled in emitxarch.cpp.
-// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
-// This hopefully means we do not need psrldq128/psrldq256/psrldq512 etc
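The shared-encoding remark above is easier to follow next to the concrete opcodes: per the Intel SDM, the immediate forms of the packed shifts reuse one opcode per element size and differ only in the ModRM reg digit. A small illustrative C++ table follows (assumed SDM /digit values, not JIT data):

```cpp
// Immediate-form shift encodings that share opcode 66 0F 73 and differ only
// in the ModRM reg digit (Intel SDM /digit values); illustrative, not JIT data.
struct ShiftImmEncoding
{
    const char* mnemonic;
    unsigned    opcode;   // opcode byte following the 66 0F prefix/escape
    unsigned    regDigit; // ModRM.reg field selecting the operation
};

static const ShiftImmEncoding kGroup73[] = {
    {"psrlq",  0x73, 2}, // shift right logical, qword elements
    {"psrldq", 0x73, 3}, // shift right logical, whole xmm, byte count
    {"psllq",  0x73, 6}, // shift left logical, qword elements
    {"pslldq", 0x73, 7}, // shift left logical, whole xmm, byte count
};
```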
-INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes
-INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes
-INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers
-INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers
-INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers
-INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers
-INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 32-bit integers
-INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers
-INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers
-INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers
-INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes
-INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes
-INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words
-INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words
-INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality
-INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than
-INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality
-INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than
-INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality
-INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than
-
-INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_TT_FULL, Input_32Bit) // Packed shuffle of 32-bit integers
-INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_TT_FULL_MEM, Input_16Bit) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
-INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_TT_FULL_MEM, Input_16Bit) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
-INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_TT_TUPLE1_SCALAR, Input_16Bit) // Extract 16-bit value into a r32 with zero extended to 32-bits
-INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index
-
-INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi)
-INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo)
-INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi)
-INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo)
-INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi)
-INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo)
-INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave the high packed doubles
-INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave the low packed doubles
-
-INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation
-INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation
-INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation
-// id nm um mr mi rm flags
-INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs
-INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs
INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs -INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value -INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result -INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_TT_NONE, INS_FLAGS_None) // Packed logical compare -INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add -INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_TT_FULL_MEM, Input_8Bit | INS_FLAGS_None) // Packed absolute value of bytes -INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_TT_FULL_MEM, Input_16Bit | INS_FLAGS_None) // Packed absolute value of 16-bit integers -INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Packed absolute value of 32-bit integers -// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. -// This hopefully means we do not need palignr128/palignr256/palignr512 etc. -INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right -INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes -INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale -INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes -INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_TT_NONE, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes -INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers -INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers -INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers -INST3(pmaxsb, 
"pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes -INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers -INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers -INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers -INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_TT_HALF_MEM, Input_8Bit) // Packed sign extend byte to short -INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_TT_QUARTER_MEM, Input_8Bit) // Packed sign extend byte to int -INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_TT_EIGHTH_MEM, Input_8Bit) // Packed sign extend byte to long -INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_TT_HALF_MEM, Input_16Bit) // Packed sign extend short to int -INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_TT_QUARTER_MEM, Input_16Bit) // Packed sign extend short to long -INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_TT_HALF_MEM, Input_32Bit) // Packed sign extend int to long -INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_TT_HALF_MEM, Input_8Bit) // Packed zero extend byte to short -INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_TT_QUARTER_MEM, Input_8Bit) // Packed zero extend byte to intg -INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_TT_EIGHTH_MEM, Input_8Bit) // Packed zero extend byte to lon -INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_TT_HALF_MEM, Input_16Bit) // Packed zero extend short to int -INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_TT_QUARTER_MEM, Input_16Bit) // Packed zero extend short to long -INST3(pmovzxdq, "pmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), INS_TT_HALF_MEM, Input_32Bit) // Packed zero extend int to long -INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation -INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Round packed single precision floating-point values -INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values -INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_TT_NONE, Input_64Bit | INS_FLAGS_None) // Round packed double precision floating-point values -INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values -INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result -INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single 
Precision Floating-Point Values -INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_TT_NONE, Input_32Bit | INS_FLAGS_None) // Variable Blend Packed Singles -INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_TT_NONE, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values -INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_TT_NONE, Input_64Bit | INS_FLAGS_None) // Variable Blend Packed Doubles -INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words -INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_NONE, Input_8Bit | INS_FLAGS_None) // Variable Blend Packed Bytes -INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers -INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers -INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_TT_NONE, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers -INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation -INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_TT_NONE, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation -// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value. -// This hopefully means we do not need lddqu128/lddqu256/lddqu512 etc. 
-INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_TT_NONE, INS_FLAGS_None) // Load Unaligned integer
-INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_TT_FULL_MEM, INS_FLAGS_None) // Load Double Quadword Non-Temporal Aligned Hint
-INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_TT_MOVDDUP, Input_64Bit | INS_FLAGS_None) // Replicate Double FP Values
-INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) // Replicate even-indexed Single FP Values
-INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) // Replicate odd-indexed Single FP Values
-INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_TT_NONE, Input_16Bit | INS_FLAGS_None) // Packed Horizontal Word Minimum
-INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference
-INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte
-INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword
-INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword
-INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_FLAGS_None) // Extract Byte
-INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Extract Dword
-INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Extract Qword
-INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Extract Word
-INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Extract Packed Floating-Point Values
-
-//PCLMULQDQ instructions
-INST3(pclmulqdq, "pclmulqdq" , IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_TT_FULL_MEM, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords
-
-//AES instructions
-// TODO-XArch-AVX512: Input mem size = vector length. So, should automatically default to the right value.
-// This hopefully means we do not need aesdec128/aesdec256/aesdec512 etc.
-INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow
-INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow
-INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow
-INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_TT_FULL_MEM, INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow
-INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_TT_NONE, INS_FLAGS_None) // Perform the AES InvMixColumn Transformation
-INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_TT_NONE, INS_FLAGS_None) // AES Round Key Generation Assist
+// SSE
+INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles
+INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles
+INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles
+INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles
+INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles
+INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles
+INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // ordered compare singles
+INST3(cvtsi2ss32, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single
+INST3(cvtsi2ss64, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar single
+INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt scalar single to DWORD/QWORD
+INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD
+INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles
+INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles
+INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles
+INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single
+INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles
+INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single
+INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_TT_TUPLE2, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_TT_NONE, REX_WIG | Encoding_VEX)
+INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles
+INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single
+INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles
+INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_TT_NONE, REX_WIG | Encoding_VEX) // Reciprocal of packed singles
+INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single
+INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_TT_NONE, REX_WIG | Encoding_VEX) // Reciprocal Sqrt of packed singles
+INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single
+INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed singles
+INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single
+INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles
+INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5C), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles
+INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // unordered compare singles
+INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles
+
+// SSE2
+INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles
+INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles
+INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles
+INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles
+INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles
+INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles
+INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // ordered compare doubles
+INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed DWORDs to doubles
+INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed DWORDs to singles
+INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed doubles to DWORDs
+INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed doubles to singles
+INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed singles to DWORDs
+INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_TT_HALF, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt packed singles to doubles
+INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt scalar double to DWORD
+INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles
+INST3(cvtsi2sd32, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double
+INST3(cvtsi2sd64, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar double
+INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles
+INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // cvt with trunc packed doubles to DWORDs
+INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt with trunc packed singles to DWORDs
+INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar double to signed DWORDs
+INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles
+INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles
+INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_TT_NONE, REX_WIG)
+INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles
+INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double
+INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG)
+INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles
+INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double
+INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // Move DWORD/QWORD between xmm regs <-> memory/r32/r64 regs
+INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_TT_NONE, REX_WIG | Encoding_VEX) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros.
+INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WX)
+INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Move Quadword between memory/mm <-> regs
+INST3(movsd_simd, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
+INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX)
+INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles
+INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles
+INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles
+INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation
+INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation
+INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation
+INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers
+INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers
+INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers
+INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results
+INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results
+INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results
+INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results
+INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers
+INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
+INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
+INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers
+INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers
+INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality
+INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality
+INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality
+INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than
+INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than
+INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than
+INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 16-bit value into a r32 with zero extended to 32-bits
+INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index
+INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst
+INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words
+INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes
+INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words
+INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes
+INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_TT_NONE, REX_WIG | Encoding_VEX) // Move the MSB bits of all bytes in a xmm reg to an int reg
+INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers
+INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers
+INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result
+INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result
+INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
+INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers
+INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed shuffle of 32-bit integers
+INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
+INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
+INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers
+INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes
+INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers
+INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers
+INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers
+INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers
+INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_TT_FULL | INS_TT_MEM128, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 32-bit integers
+INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes
+INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_TT_FULL | INS_TT_MEM128, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers
+INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_TT_FULL_MEM | INS_TT_MEM128, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers
+INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed byte (8-bit) integers
+INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers
+INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers
+INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers
+INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation
+INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation
+INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation
+INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation
+INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi)
+INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi)
+INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi)
+INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo)
+INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo)
+INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo)
+INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
+INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed doubles
+INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double
+INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles
+INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar doubles
+INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Writes_PF | Writes_CF) // unordered compare doubles
+INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave the high packed doubles
+INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unpack and interleave the low packed doubles
+INST3(xorpd, "xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles
+
+// SSE3
+INST3(addsubpd, "addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles
+INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles
+INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles
+INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats
+INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles
+INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats
+INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_TT_NONE, REX_WIG | Encoding_VEX) // Load Unaligned integer
+INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_TT_MOVDDUP, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values
+INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values
+INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values
+
+// SSSE3
+INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes
+INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers
+INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers
+INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right
+INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add
+INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation
+INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers
+INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers
+INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation
+INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers
+INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes
+INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale
+INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes
+INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_TT_NONE, Input_8Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
+INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
+INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
+
+// AESNI & PCLMULQDQ
+INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow
+INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow
+INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow
+INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow
+INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_TT_NONE, REX_WIG | Encoding_VEX) // Perform the AES InvMixColumn Transformation
+INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_TT_NONE, REX_WIG | Encoding_VEX) // AES Round Key Generation Assist
+INST3(pclmulqdq, "pclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords
+
+// SSE4.1
+INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values
+INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values
+INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_TT_NONE, Input_64Bit | REX_W0) // Variable Blend Packed Doubles
+INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_TT_NONE, Input_32Bit | REX_W0) // Variable Blend Packed Singles
+INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs
+INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs
+INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Extract Packed Floating-Point Values
+INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value
+INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint
+INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_TT_NONE, Input_8Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference
+INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation
+INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_NONE, Input_8Bit | REX_W0) // Variable Blend Packed Bytes
+INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words
+INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality
+INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte
+INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword
+INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword
+INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Word
+INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_TT_NONE, Input_16Bit | REX_WIG | Encoding_VEX) // Packed Horizontal Word Minimum
+INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte
+INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword
+INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword
+INST3(pmaxsb, "pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes
+INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers
+INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers
+INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers
+INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes
+INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers
+INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers
+INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers
+INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_TT_QUARTER_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to int
+INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_TT_EIGHTH_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to long
+INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_TT_HALF_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to short
+INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed sign extend int to long
+INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_TT_HALF_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to int
+INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_TT_QUARTER_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to long
+INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_TT_QUARTER_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to int
+INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_TT_EIGHTH_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to long
+INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_TT_HALF_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to short
+INST3(pmovzxdq, "pmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed zero extend int to long
+INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_TT_HALF_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to int
+INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_TT_QUARTER_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long
+INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_TT_FULL, Input_32Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result
+INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
+INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_TT_NONE, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare
+INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX) // Round packed double precision floating-point values
+INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX) // Round packed single precision floating-point values
+INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values
+INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values
+
+// SSE4.2
+INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit signed integers for greater than
+
 INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-// AVX only instructions
-INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register
-INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register
-INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_TT_TUPLE1_SCALAR, Input_8Bit | INS_FLAGS_None) // Broadcast int8 value from reg/memory to entire ymm register
-INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register
-INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register
-INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register
-INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit) // Extract 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4
-INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit) // Extract 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4
-INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is insertf32x4
-INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is inserti32x4
-INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
-INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register
-INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register
-INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs
-INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles
-INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles
-INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes
-INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_TT_NONE, INS_FLAGS_None) // Packed Bit Test
-INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_TT_NONE, INS_FLAGS_None) // Packed Bit Test
-INST3(vpsrlvd, "psrlvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical
-INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical
-INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic
-INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
// Variable Bit Shift Left Logical -INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_TT_FULL, Input_32Bit | INS_FLAGS_None) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vpermilpsvar, "permilpsvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpdvar, "permilpdvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values -INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute Double-Precision Floating-Point Values -INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements -INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements -INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_NONE, INS_FLAGS_None) // Broadcast packed float values read from memory to entire ymm register -INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_NONE, INS_FLAGS_None) // Broadcast packed integer values read from memory to entire ymm register -INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores -INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores -INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores -INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores -INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword -INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword -INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices -INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // 
Gather Packed Qword with Signed Dword Indices -INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices -INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices -INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices -INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices + +// AVX +INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_TT_NONE, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles +INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_TT_NONE, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles +INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast packed float values read from memory to entire ymm register +INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // Broadcast float value read from memory to entire ymm register +INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast float value read from memory to entire ymm register +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 128-bit packed floating point values +INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values +INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_TT_NONE, Input_64Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores +INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores +INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_TT_NONE, Input_8Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes +INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_TT_NONE, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values +INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_TT_FULL, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values +INST3(vpermilpdvar, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_TT_FULL, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values +INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vpermilpsvar, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_TT_NONE, Input_64Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test +INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed Bit Test +INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | Encoding_VEX) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) + +// AVX2 +INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast packed integer values read from memory to entire ymm register +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 128-bit packed integer values +INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_NONE, Input_64Bit | REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices +INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices +INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_NONE, Input_64Bit | REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices +INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs +INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast int8 value from reg/memory to entire ymm register +INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast int32 value from reg/memory to entire ymm register +INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // Broadcast int64 value from reg/memory to entire ymm register +INST3(vpbroadcastw, "pbroadcastw", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast int16 value from reg/memory to entire ymm register +INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register +INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements +INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Permute Double-Precision Floating-Point Values +INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements +INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Permute 64-bit of input register +INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword +INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices +INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword +INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices +INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores +INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_TT_NONE, Input_64Bit | REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores +INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic +INST3(vpsrlvd, "psrlvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit 
Shift Right Logical INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // id nm um mr mi rm flags -INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ss, "fmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values -INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values -INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point 
Values -INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), 
INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ss, "fmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // 
+INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values +INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values +INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231pd, "fnmsub231pd", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVXVNNI_INSTRUCTION, "FIRST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(vpdpbusd, "pdpbusd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x50), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes -INST3(vpdpwssd, "pdpwssd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x52), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers -INST3(vpdpbusds, "pdpbusds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x51), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation -INST3(vpdpwssds, "pdpwssds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x53), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation +INST3(vpdpbusd, "pdpbusd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x50), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes +INST3(vpdpwssd, "pdpwssd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x52), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers +INST3(vpdpbusds, "pdpbusds", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x51), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Unsigned and Signed Bytes with Saturation +INST3(vpdpwssds, "pdpwssds", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x53), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Signed Word Integers with Saturation INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -// BMI1 + INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract + +// BMI1 +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit // BMI2 -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, INS_FLAGS_None) -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), 
INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX) #ifdef TARGET_AMD64 -INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags -INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags -INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF7), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags +INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags +INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags +INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags #endif INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) - -INST3(kmovb_msk, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovw_msk, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovd_msk, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovq_msk, "kmovq", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) - +INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) -INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX) +INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), 
INS_TT_NONE, REX_W0 | Encoding_VEX)
+INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX)
+INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX)
-INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None )
+INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX)
+INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX)
+INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX)
+INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX)
+INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(FIRST_AVX512F_INSTRUCTION, "FIRST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(vmovdqa32, "movdqa32", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0)
-INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1)
-INST3(vmovdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0)
-INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1)
-INST3(vpandd, "pandd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
-INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
-INST3(vpandnd, "pandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
-INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
-INST3(vpord, "pord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
-INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
-INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(vpxord, "pxord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
-INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
-INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
-INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | INS_FLAGS_None)
-INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | INS_FLAGS_None)
-INST3(LAST_AVX512BW_INSTRUCTION, "LAST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
-INST3(FIRST_AVX512DQ_INSTRUCTION, "FIRST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
-INST3(vpmovb2m, "vpmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit)
-INST3(vpmovw2m, "vpmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit)
-INST3(vpmovd2m, "vpmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit)
-INST3(vpmovq2m, "vpmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit)
+// AVX512F
+INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
+INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
+INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
+INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
+INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
+
+// AVX512BW
+INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+
+// AVX512DQ
+INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed single-precision floating point values
+INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed doubleword integer values
+INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
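
// Editor's note: a minimal, self-contained sketch of how the Encoding_VEX /
// Encoding_EVEX bits now carried in the flags column above can drive an
// encoding query -- a table lookup plus a bit test instead of a long
// per-instruction switch. The bit positions, the two-entry table, and the
// HasEvexEncoding() helper below are illustrative assumptions for this sketch,
// not the jit's real declarations.
#include <cstdint>
#include <cstdio>

enum insFlags : uint64_t
{
    Encoding_VEX  = 1ULL << 40, // hypothetical bit positions
    Encoding_EVEX = 1ULL << 41,
};

enum instruction
{
    INS_kmovb_msk, // VEX-only kmask move, per the kmov rows above
    INS_vpxorq,    // EVEX-only AVX512F form, per the AVX512F rows above
    INS_COUNT
};

// Tiny stand-in for a flags table built from entries like the ones above.
static const uint64_t instInfo[INS_COUNT] = {
    Encoding_VEX,  // INS_kmovb_msk
    Encoding_EVEX, // INS_vpxorq
};

static bool HasEvexEncoding(instruction ins)
{
    return (instInfo[ins] & Encoding_EVEX) != 0;
}

int main()
{
    printf("kmovb_msk has EVEX form: %d\n", HasEvexEncoding(INS_kmovb_msk)); // 0
    printf("vpxorq    has EVEX form: %d\n", HasEvexEncoding(INS_vpxorq));    // 1
    return 0;
}
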
@@ -685,94 +646,94 @@ INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_
 INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_TT_NONE, INS_FLAGS_None)

 // BMI1
-INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF ) // Count the Number of Trailing Zero Bits
+INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Count the Number of Trailing Zero Bits

 // LZCNT
-INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF )
+INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF)

 // MOVBE
-INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_TT_NONE, INS_FLAGS_None )
+INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_TT_NONE, INS_FLAGS_None)

 // POPCNT
-INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF )
+INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF)

 // id nm um mr mi flags
-INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None )
-INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None )
-INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, INS_FLAGS_None )
-
-INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit )
-
-INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit )
-INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
+INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None)
+INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None)
+INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, INS_FLAGS_None)
+
+INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit)
+
+INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
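
// Editor's note: the rol/ror/rcl/rcr/shl/shr/sar rows above differ only in the
// OF column: the by-1 forms (rol_1, shl_1, ...) carry Writes_OF because x86
// defines OF for a shift/rotate count of exactly 1, while the by-CL and
// by-immediate forms carry Undefined_OF. A self-contained sketch of how a
// consumer might honor that distinction; the names and bit values here are
// illustrative, not the jit's.
#include <cstdint>
#include <cstdio>

enum insFlags : uint32_t
{
    Writes_OF    = 1u << 0,
    Undefined_OF = 1u << 1,
};

enum instruction { INS_shl_1, INS_shl_N, INS_COUNT };

static const uint32_t instInfo[INS_COUNT] = {
    Writes_OF,    // shl_1: OF := MSB(result) XOR CF, architecturally defined
    Undefined_OF, // shl_N: OF is architecturally undefined when count != 1
};

// A flags-reuse optimization may consume OF only when the producing
// instruction is known to write it.
static bool CanConsumeOverflowFlag(instruction producer)
{
    return (instInfo[producer] & Writes_OF) != 0;
}

int main()
{
    printf("OF usable after shl_1: %d\n", CanConsumeOverflowFlag(INS_shl_1)); // 1
    printf("OF usable after shl_N: %d\n", CanConsumeOverflowFlag(INS_shl_N)); // 0
    return 0;
}
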

 // id nm um mr flags
-INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
-INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
+INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
 #if defined(TARGET_AMD64)
-INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, INS_TT_NONE, Reads_DF )
+INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, INS_TT_NONE, Reads_DF)
 #endif // defined(TARGET_AMD64)
-INST1(movsb, "movsb", IUM_RD, 0x0000A4, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
-INST1(movsd, "movsd", IUM_RD, 0x0000A5, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(movsb, "movsb", IUM_RD, 0x0000A4, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
+INST1(movsd, "movsd", IUM_RD, 0x0000A5, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
 #if defined(TARGET_AMD64)
-INST1(movsq, "movsq", IUM_RD, 0x00A548, INS_TT_NONE, Reads_DF )
+INST1(movsq, "movsq", IUM_RD, 0x00A548, INS_TT_NONE, Reads_DF)
 #endif // defined(TARGET_AMD64)
-INST1(r_stosb, "rep stosb", IUM_RD, 0x00AAF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
-INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(r_stosb, "rep stosb", IUM_RD, 0x00AAF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
+INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
 #if defined(TARGET_AMD64)
-INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, INS_TT_NONE, Reads_DF )
+INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, INS_TT_NONE, Reads_DF)
 #endif // defined(TARGET_AMD64)
-INST1(stosb, "stosb", IUM_RD, 0x0000AA, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
-INST1(stosd, "stosd", IUM_RD, 0x0000AB, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit )
+INST1(stosb, "stosb", IUM_RD, 0x0000AA, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
+INST1(stosd, "stosd", IUM_RD, 0x0000AB, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
 #if defined(TARGET_AMD64)
-INST1(stosq, "stosq", IUM_RD, 0x00AB48, INS_TT_NONE, Reads_DF )
+INST1(stosq, "stosq", IUM_RD, 0x00AB48, INS_TT_NONE, Reads_DF)
 #endif // defined(TARGET_AMD64)

-INST1(int3, "int3", IUM_RD, 0x0000CC, INS_TT_NONE, INS_FLAGS_None )
-INST1(nop, "nop", IUM_RD, 0x000090, INS_TT_NONE, INS_FLAGS_None )
-INST1(pause, "pause", IUM_RD, 0x0090F3, INS_TT_NONE, INS_FLAGS_None )
-INST1(lock, "lock", IUM_RD, 0x0000F0, INS_TT_NONE, INS_FLAGS_None )
-INST1(leave, "leave", IUM_RD, 0x0000C9, INS_TT_NONE, INS_FLAGS_None )
+INST1(int3, "int3", IUM_RD, 0x0000CC, INS_TT_NONE, INS_FLAGS_None)
+INST1(nop, "nop", IUM_RD, 0x000090, INS_TT_NONE, INS_FLAGS_None)
+INST1(pause, "pause", IUM_RD, 0x0090F3, INS_TT_NONE, INS_FLAGS_None)
+INST1(lock, "lock", IUM_RD, 0x0000F0, INS_TT_NONE, INS_FLAGS_None)
+INST1(leave, "leave", IUM_RD, 0x0000C9, INS_TT_NONE, INS_FLAGS_None)

-INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None )
+INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None)

-INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit )
+INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit)

-INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None )
-INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None )
-INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit )
-INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit )
-INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit )
+INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None)
+INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None)
+INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit)
+INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit)
+INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit)

-INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF )
+INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF)

-INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
-INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit )
+INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)
+INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit)

-INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF )
-INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF )
+INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF)
+INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF)
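
// Editor's note: sahf above is the one entry tagged Restore_SF_ZF_AF_PF_CF;
// it copies AH into the low byte of EFLAGS, restoring exactly those five
// flags. A self-contained sketch of the architectural bit layout (SF=7, ZF=6,
// AF=4, PF=2, CF=0; bits 1/3/5 are reserved and not transferred). ApplySahf is
// a hypothetical helper written only for this illustration.
#include <cstdint>
#include <cstdio>

static uint32_t ApplySahf(uint32_t eflags, uint8_t ah)
{
    const uint32_t kSahfMask = 0xD5; // 0b1101'0101 = SF | ZF | AF | PF | CF
    return (eflags & ~kSahfMask) | (ah & kSahfMask);
}

int main()
{
    // AH = 0x40 sets ZF and clears SF/AF/PF/CF, leaving the other bits alone.
    printf("0x%02X\n", (unsigned)ApplySahf(0x00000000, 0x40)); // prints 0x40
    return 0;
}
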

 // For RyuJIT/x86, we follow the x86 calling convention that requires
 // us to return floating point value on the x87 FP stack, so we need
@@ -782,63 +743,63 @@ INST1(fld, "fld", IUM_WR, 0x0000D9,
 INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_TT_NONE, INS_FLAGS_x87Instr)
 #endif // TARGET_X86

-INST1(seto, "seto", IUM_WR, 0x0F0090, INS_TT_NONE, Reads_OF )
-INST1(setno, "setno", IUM_WR, 0x0F0091, INS_TT_NONE, Reads_OF )
-INST1(setb, "setb", IUM_WR, 0x0F0092, INS_TT_NONE, Reads_CF )
-INST1(setae, "setae", IUM_WR, 0x0F0093, INS_TT_NONE, Reads_CF )
-INST1(sete, "sete", IUM_WR, 0x0F0094, INS_TT_NONE, Reads_ZF )
-INST1(setne, "setne", IUM_WR, 0x0F0095, INS_TT_NONE, Reads_ZF )
-INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST1(seta, "seta", IUM_WR, 0x0F0097, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST1(sets, "sets", IUM_WR, 0x0F0098, INS_TT_NONE, Reads_SF )
-INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF )
-INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF )
-INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF )
-INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF )
-INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF )
-INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
+INST1(seto, "seto", IUM_WR, 0x0F0090, INS_TT_NONE, Reads_OF)
+INST1(setno, "setno", IUM_WR, 0x0F0091, INS_TT_NONE, Reads_OF)
+INST1(setb, "setb", IUM_WR, 0x0F0092, INS_TT_NONE, Reads_CF)
+INST1(setae, "setae", IUM_WR, 0x0F0093, INS_TT_NONE, Reads_CF)
+INST1(sete, "sete", IUM_WR, 0x0F0094, INS_TT_NONE, Reads_ZF)
+INST1(setne, "setne", IUM_WR, 0x0F0095, INS_TT_NONE, Reads_ZF)
+INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST1(seta, "seta", IUM_WR, 0x0F0097, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST1(sets, "sets", IUM_WR, 0x0F0098, INS_TT_NONE, Reads_SF)
+INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF)
+INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF)
+INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF)
+INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF)
+INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF)
+INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)
+INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)

 // Indirect jump used for tailcalls. We differentiate between func-internal
 // indirect jump (e.g. used for switch) and tailcall indirect jumps because the
 // x64 unwinder might require the latter to be rex.w prefixed.
-INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None )
-INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None )
-INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_TT_NONE, INS_FLAGS_None )
-INST0(jo, "jo", IUM_RD, 0x000070, INS_TT_NONE, Reads_OF )
-INST0(jno, "jno", IUM_RD, 0x000071, INS_TT_NONE, Reads_OF )
-INST0(jb, "jb", IUM_RD, 0x000072, INS_TT_NONE, Reads_CF )
-INST0(jae, "jae", IUM_RD, 0x000073, INS_TT_NONE, Reads_CF )
-INST0(je, "je", IUM_RD, 0x000074, INS_TT_NONE, Reads_ZF )
-INST0(jne, "jne", IUM_RD, 0x000075, INS_TT_NONE, Reads_ZF )
-INST0(jbe, "jbe", IUM_RD, 0x000076, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST0(ja, "ja", IUM_RD, 0x000077, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST0(js, "js", IUM_RD, 0x000078, INS_TT_NONE, Reads_SF )
-INST0(jns, "jns", IUM_RD, 0x000079, INS_TT_NONE, Reads_SF )
-INST0(jp, "jp", IUM_RD, 0x00007A, INS_TT_NONE, Reads_PF )
-INST0(jnp, "jnp", IUM_RD, 0x00007B, INS_TT_NONE, Reads_PF )
-INST0(jl, "jl", IUM_RD, 0x00007C, INS_TT_NONE, Reads_OF | Reads_SF )
-INST0(jge, "jge", IUM_RD, 0x00007D, INS_TT_NONE, Reads_OF | Reads_SF )
-INST0(jle, "jle", IUM_RD, 0x00007E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-INST0(jg, "jg", IUM_RD, 0x00007F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-
-INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_TT_NONE, INS_FLAGS_None )
-INST0(l_jo, "jo", IUM_RD, 0x00800F, INS_TT_NONE, Reads_OF )
-INST0(l_jno, "jno", IUM_RD, 0x00810F, INS_TT_NONE, Reads_OF )
-INST0(l_jb, "jb", IUM_RD, 0x00820F, INS_TT_NONE, Reads_CF )
-INST0(l_jae, "jae", IUM_RD, 0x00830F, INS_TT_NONE, Reads_CF )
-INST0(l_je, "je", IUM_RD, 0x00840F, INS_TT_NONE, Reads_ZF )
-INST0(l_jne, "jne", IUM_RD, 0x00850F, INS_TT_NONE, Reads_ZF )
-INST0(l_jbe, "jbe", IUM_RD, 0x00860F, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST0(l_ja, "ja", IUM_RD, 0x00870F, INS_TT_NONE, Reads_ZF | Reads_CF )
-INST0(l_js, "js", IUM_RD, 0x00880F, INS_TT_NONE, Reads_SF )
-INST0(l_jns, "jns", IUM_RD, 0x00890F, INS_TT_NONE, Reads_SF )
-INST0(l_jp, "jp", IUM_RD, 0x008A0F, INS_TT_NONE, Reads_PF )
-INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, INS_TT_NONE, Reads_PF )
-INST0(l_jl, "jl", IUM_RD, 0x008C0F, INS_TT_NONE, Reads_OF | Reads_SF )
-INST0(l_jge, "jge", IUM_RD, 0x008D0F, INS_TT_NONE, Reads_OF | Reads_SF )
-INST0(l_jle, "jle", IUM_RD, 0x008E0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
-INST0(l_jg, "jg", IUM_RD, 0x008F0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF )
+INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None)
+INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None)
+INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_TT_NONE, INS_FLAGS_None)
+INST0(jo, "jo", IUM_RD, 0x000070, INS_TT_NONE, Reads_OF)
+INST0(jno, "jno", IUM_RD, 0x000071, INS_TT_NONE, Reads_OF)
+INST0(jb, "jb", IUM_RD, 0x000072, INS_TT_NONE, Reads_CF)
+INST0(jae, "jae", IUM_RD, 0x000073, INS_TT_NONE, Reads_CF)
+INST0(je, "je", IUM_RD, 0x000074, INS_TT_NONE, Reads_ZF)
+INST0(jne, "jne", IUM_RD, 0x000075, INS_TT_NONE, Reads_ZF)
+INST0(jbe, "jbe", IUM_RD, 0x000076, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST0(ja, "ja", IUM_RD, 0x000077, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST0(js, "js", IUM_RD, 0x000078, INS_TT_NONE, Reads_SF)
+INST0(jns, "jns", IUM_RD, 0x000079, INS_TT_NONE, Reads_SF)
+INST0(jp, "jp", IUM_RD, 0x00007A, INS_TT_NONE, Reads_PF)
+INST0(jnp, "jnp", IUM_RD, 0x00007B, INS_TT_NONE, Reads_PF)
+INST0(jl, "jl", IUM_RD, 0x00007C, INS_TT_NONE, Reads_OF | Reads_SF)
+INST0(jge, "jge", IUM_RD, 0x00007D, INS_TT_NONE, Reads_OF | Reads_SF)
+INST0(jle, "jle", IUM_RD, 0x00007E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)
+INST0(jg, "jg", IUM_RD, 0x00007F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)
+
+INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_TT_NONE, INS_FLAGS_None)
+INST0(l_jo, "jo", IUM_RD, 0x00800F, INS_TT_NONE, Reads_OF)
+INST0(l_jno, "jno", IUM_RD, 0x00810F, INS_TT_NONE, Reads_OF)
+INST0(l_jb, "jb", IUM_RD, 0x00820F, INS_TT_NONE, Reads_CF)
+INST0(l_jae, "jae", IUM_RD, 0x00830F, INS_TT_NONE, Reads_CF)
+INST0(l_je, "je", IUM_RD, 0x00840F, INS_TT_NONE, Reads_ZF)
+INST0(l_jne, "jne", IUM_RD, 0x00850F, INS_TT_NONE, Reads_ZF)
+INST0(l_jbe, "jbe", IUM_RD, 0x00860F, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST0(l_ja, "ja", IUM_RD, 0x00870F, INS_TT_NONE, Reads_ZF | Reads_CF)
+INST0(l_js, "js", IUM_RD, 0x00880F, INS_TT_NONE, Reads_SF)
+INST0(l_jns, "jns", IUM_RD, 0x00890F, INS_TT_NONE, Reads_SF)
+INST0(l_jp, "jp", IUM_RD, 0x008A0F, INS_TT_NONE, Reads_PF)
+INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, INS_TT_NONE, Reads_PF)
+INST0(l_jl, "jl", IUM_RD, 0x008C0F, INS_TT_NONE, Reads_OF | Reads_SF)
+INST0(l_jge, "jge", IUM_RD, 0x008D0F, INS_TT_NONE, Reads_OF | Reads_SF)
+INST0(l_jle, "jle", IUM_RD, 0x008E0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)
+INST0(l_jg, "jg", IUM_RD, 0x008F0F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF)

 INST0(align, "align", IUM_RD, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
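
// Editor's note: the Reads_* annotations on the setcc/jcc rows above encode
// which EFLAGS bits each condition consumes: signed compares (jl/jge) read OF
// and SF, while unsigned ones (jb/jae) read only CF. A self-contained sketch
// of evaluating two such conditions from flag bits; the flag layout is the
// architectural one, but the Flags struct and Cond* helpers are illustrative.
#include <cstdio>

struct Flags
{
    bool of, sf, zf, cf;
};

// jl / setl is taken when SF != OF (signed less-than); reads OF and SF.
static bool CondL(const Flags& f) { return f.sf != f.of; }

// jb / setb is taken when CF is set (unsigned below); reads only CF.
static bool CondB(const Flags& f) { return f.cf; }

int main()
{
    Flags afterCmp = {/*of*/ false, /*sf*/ true, /*zf*/ false, /*cf*/ true};
    printf("jl taken: %d\n", CondL(afterCmp)); // 1: SF=1, OF=0
    printf("jb taken: %d\n", CondB(afterCmp)); // 1: CF=1
    return 0;
}
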
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index 634c20c0ce8f00..1b2e367136db5a 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -76,7 +76,7 @@ void CodeGen::genStoreIndTypeSimd12(GenTreeStoreInd* treeNode)
     emitter* emit = GetEmitter();

     // Store lower 8 bytes
-    emit->emitInsStoreInd(INS_movsdsse2, EA_8BYTE, treeNode);
+    emit->emitInsStoreInd(INS_movsd_simd, EA_8BYTE, treeNode);

     // Update the addr node to offset by 8
@@ -149,7 +149,7 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode)
     if (useSse41)
     {
         // Load lower 8 bytes
-        emit->emitInsLoadInd(INS_movsdsse2, EA_8BYTE, tgtReg, treeNode);
+        emit->emitInsLoadInd(INS_movsd_simd, EA_8BYTE, tgtReg, treeNode);
     }

     // Update the addr node to offset by 8
@@ -250,7 +250,7 @@ void CodeGen::genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode)
     else
     {
         // Store lower 8 bytes
-        emit->emitIns_S_R(INS_movsdsse2, EA_8BYTE, dataReg, varNum, offs);
+        emit->emitIns_S_R(INS_movsd_simd, EA_8BYTE, dataReg, varNum, offs);

         if (data->IsVectorZero())
         {
@@ -303,7 +303,7 @@ void CodeGen::genLoadLclTypeSimd12(GenTreeLclVarCommon* treeNode)
     if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
     {
         // Load lower 8 bytes into tgtReg, preserving upper 4 bytes
-        emit->emitIns_R_S(INS_movsdsse2, EA_8BYTE, tgtReg, varNum, offs);
+        emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, varNum, offs);

         // Load and insert upper 4 byte, 0x20 inserts to index 2 and 0x8 zeros index 3
         emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, varNum, offs + 8, 0x28);
@@ -345,7 +345,7 @@ void CodeGen::genStoreSimd12ToStack(regNumber dataReg, regNumber tmpReg)
     emitter* emit = GetEmitter();

     // Store lower 8 bytes
-    emit->emitIns_AR_R(INS_movsdsse2, EA_8BYTE, dataReg, REG_SPBASE, 0);
+    emit->emitIns_AR_R(INS_movsd_simd, EA_8BYTE, dataReg, REG_SPBASE, 0);

     // Extract upper 4 bytes from data
     emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);
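
// Editor's note: the genStoreSimd12ToStack hunk above writes a 12-byte Vector3
// as an 8-byte low half (the movsd store) plus a 4-byte high element (extracted
// with movhlps in the jit code). A self-contained sketch of that split done in
// plain C++; memcpy stands in for the two machine stores, and the Simd12 struct
// and StoreSimd12 helper exist only for this illustration.
#include <cstdio>
#include <cstring>

struct Simd12
{
    float x, y, z; // 12 bytes, like TYP_SIMD12 / Vector3
};

static void StoreSimd12(unsigned char* dst, const Simd12& v)
{
    std::memcpy(dst, &v.x, 8);     // low 8 bytes: x and y (the 8-byte movsd store)
    std::memcpy(dst + 8, &v.z, 4); // high 4 bytes: z (the extracted upper element)
}

int main()
{
    unsigned char buf[12];
    Simd12 v = {1.0f, 2.0f, 3.0f};
    StoreSimd12(buf, v);

    float z;
    std::memcpy(&z, buf + 8, 4);
    printf("%g\n", z); // prints 3
    return 0;
}
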