diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index 1f8f4c6b0c7fe..c2bcedb8ea9b7 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -127,7 +127,9 @@ class CodeGenInterface #define INST_FP 0x01 // is it a FP instruction? public: static bool instIsFP(instruction ins); - +#if defined(TARGET_XARCH) + static bool instIsEmbeddedBroadcastCompatible(instruction ins); +#endif // TARGET_XARCH //------------------------------------------------------------------------- // Liveness-related fields & methods public: @@ -764,6 +766,10 @@ class CodeGenInterface virtual const char* siStackVarName(size_t offs, size_t size, unsigned reg, unsigned stkOffs) = 0; #endif // LATE_DISASM + +#if defined(TARGET_XARCH) + bool IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op); +#endif }; #endif // _CODEGEN_INTERFACE_H_ diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 1d49eb6907006..ac0bcf0dffef3 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -781,6 +781,9 @@ class emitter unsigned _idCallRegPtr : 1; // IL indirect calls: addr in reg unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables +#if defined(TARGET_XARCH) + unsigned _idEvexbContext : 1; // does EVEX.b need to be set. +#endif // TARGET_XARCH #ifdef TARGET_ARM64 opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16 @@ -814,8 +817,8 @@ class emitter //////////////////////////////////////////////////////////////////////// // Space taken up to here: - // x86: 46 bits - // amd64: 46 bits + // x86: 47 bits + // amd64: 47 bits // arm: 48 bits // arm64: 50 bits // loongarch64: 46 bits @@ -830,8 +833,10 @@ class emitter #define ID_EXTRA_BITFIELD_BITS (16) #elif defined(TARGET_ARM64) #define ID_EXTRA_BITFIELD_BITS (18) -#elif defined(TARGET_XARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) +#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) #define ID_EXTRA_BITFIELD_BITS (14) +#elif defined(TARGET_XARCH) +#define ID_EXTRA_BITFIELD_BITS (15) #else #error Unsupported or unset target architecture #endif @@ -866,8 +871,8 @@ class emitter //////////////////////////////////////////////////////////////////////// // Space taken up to here (with/without prev offset, assuming host==target): - // x86: 52/48 bits - // amd64: 53/48 bits + // x86: 53/49 bits + // amd64: 54/49 bits // arm: 54/50 bits // arm64: 57/52 bits // loongarch64: 53/48 bits @@ -1529,6 +1534,19 @@ class emitter _idNoGC = val; } +#ifdef TARGET_XARCH + bool idIsEvexbContext() const + { + return _idEvexbContext != 0; + } + void idSetEvexbContext() + { + assert(_idEvexbContext == 0); + _idEvexbContext = 1; + assert(_idEvexbContext == 1); + } +#endif + #ifdef TARGET_ARMARCH bool idIsLclVar() const { @@ -3655,9 +3673,25 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) // emitAttr emitter::emitGetMemOpSize(instrDesc* id) const { + emitAttr defaultSize = id->idOpSize(); instruction ins = id->idIns(); + if (id->idIsEvexbContext()) + { + // We assume here that EVEX.b stands for the embedded broadcast context. + // Reference: Section 2.7.5 in the Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 2. 
+ ssize_t inputSize = GetInputSizeInBytes(id); + switch (inputSize) + { + case 4: + return EA_4BYTE; + case 8: + return EA_8BYTE; + default: + unreached(); + } + } switch (ins) { case INS_pextrb: diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ce54be61350e7..dd6800347a59c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1231,9 +1231,10 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL #define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL +#define EVEX_B_BIT 0x0000001000000000ULL //------------------------------------------------------------------------ -// AddEvexPrefix: Add default EVEX perfix with only LL' bits set. +// AddEvexPrefix: Add default EVEX prefix with only LL' bits set. // // Arguments: // ins -- processor instruction to check. @@ -1268,6 +1269,22 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at return code; } +//------------------------------------------------------------------------ +// AddEvexbBit: set the EVEX.b bit if EvexbContext is set in the instruction descriptor. +// +// Arguments: +// code -- opcode bits. +// +// Return Value: +// encoded code with EVEX.b set if needed. +// +emitter::code_t emitter::AddEvexbBit(code_t code) +{ + assert(hasEvexPrefix(code)); + code |= EVEX_B_BIT; + return code; +} + // Returns true if this instruction requires a VEX prefix // All AVX instructions require a VEX prefix bool emitter::TakesVexPrefix(instruction ins) const @@ -6667,7 +6684,8 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int emitCurIGsize += sz; } -void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir) +void emitter::emitIns_R_R_A( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6678,6 +6696,11 @@ void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regN id->idIns(ins); id->idReg1(reg1); id->idReg2(reg2); + if (instOptions == INS_OPTS_EVEX_b) + { + assert(UseEvexEncoding()); + id->idSetEvexbContext(); + } emitHandleMemOp(indir, id, (ins == INS_mulx) ? 
IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); @@ -6778,8 +6801,13 @@ void emitter::emitIns_R_AR_R(instruction ins, emitCurIGsize += sz; } -void emitter::emitIns_R_R_C( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs) +void emitter::emitIns_R_R_C(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6797,6 +6825,11 @@ void emitter::emitIns_R_R_C( id->idReg1(reg1); id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; + if (instOptions == INS_OPTS_EVEX_b) + { + assert(UseEvexEncoding()); + id->idSetEvexbContext(); + } UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -6829,7 +6862,8 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, emitCurIGsize += sz; } -void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs) +void emitter::emitIns_R_R_S( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6842,6 +6876,11 @@ void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regN id->idReg2(reg2); id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + if (instOptions == INS_OPTS_EVEX_b) + { + assert(UseEvexEncoding()); + id->idSetEvexbContext(); + } #ifdef DEBUG id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; #endif @@ -8126,14 +8165,15 @@ void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targe // indir -- The GenTreeIndir used for the memory address // void emitter::emitIns_SIMD_R_R_A( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir) + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir); + emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir, instOptions); } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_A(ins, attr, targetReg, indir); } @@ -8151,15 +8191,21 @@ void emitter::emitIns_SIMD_R_R_A( // fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address // offs -- The offset added to the memory address from fldHnd // -void emitter::emitIns_SIMD_R_R_C( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs) +void emitter::emitIns_SIMD_R_R_C(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs); + emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs, instOptions); } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_C(ins, attr, targetReg, fldHnd, offs); } @@ -8214,14 +8260,15 @@ void emitter::emitIns_SIMD_R_R_R( // offs -- The offset added to the memory address from varx // void emitter::emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs) + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int 
offs, insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs); + emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs, instOptions); } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_S(ins, attr, targetReg, varx, offs); } @@ -15709,7 +15756,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Return Value: // size in bytes. // -ssize_t emitter::GetInputSizeInBytes(instrDesc* id) +ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const { insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask)); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index c360efdd52ff7..3fdda27eb1008 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -294,6 +294,7 @@ bool hasEvexPrefix(code_t code) return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; } code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); +code_t AddEvexbBit(code_t code); //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. @@ -314,6 +315,10 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) if (TakesEvexPrefix(id)) { code = AddEvexPrefix(ins, code, size); + if (id->idIsEvexbContext()) + { + code = AddEvexbBit(code); + } } else if (TakesVexPrefix(ins)) { @@ -385,7 +390,7 @@ bool codeEvexMigrationCheck(code_t code) return hasEvexPrefix(code); } -ssize_t GetInputSizeInBytes(instrDesc* id); +ssize_t GetInputSizeInBytes(instrDesc* id) const; bool containsAVXInstruction = false; bool ContainsAVX() @@ -572,7 +577,12 @@ void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); -void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir); +void emitIns_R_R_A(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + GenTreeIndir* indir, + insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); @@ -585,10 +595,21 @@ void emitIns_R_AR_R(instruction ins, int scale, int offs); -void emitIns_R_R_C( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); - -void emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs); +void emitIns_R_R_C(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + insOpts instOptions = INS_OPTS_NONE); + +void emitIns_R_R_S(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + int varx, + int offs, + insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); @@ -688,11 +709,27 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); -void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_C( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); +void emitIns_SIMD_R_R_A(instruction ins, + 
emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + GenTreeIndir* indir, + insOpts instOptions = INS_OPTS_NONE); +void emitIns_SIMD_R_R_C(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); -void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs); +void emitIns_SIMD_R_R_S(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + int varx, + int offs, + insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); @@ -837,7 +874,6 @@ inline bool emitIsUncondJump(instrDesc* jmp) //------------------------------------------------------------------------ // HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding. -// TODO-XArch-AVX512: Add eventual check on the instrDesc // // Arguments: // id - Instruction descriptor. @@ -847,7 +883,7 @@ inline bool emitIsUncondJump(instrDesc* jmp) // inline bool HasEmbeddedBroadcast(const instrDesc* id) const { - return false; + return id->idIsEvexbContext(); } inline bool HasHighSIMDReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9bd689dac50de..e8ef27402111e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19137,6 +19137,18 @@ bool GenTree::isContainableHWIntrinsic() const return true; } + case NI_SSE3_MoveAndDuplicate: + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + { + // These intrinsic operations are contained as part of the operand of an embedded broadcast + // compatible instruction. + return true; + } + default: { return false; @@ -25071,6 +25083,44 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const } } +//------------------------------------------------------------------------ +// OperIsEmbBroadcastCompatible: Checks if the intrinsic is an embedded broadcast compatible intrinsic. +// +// Return Value: +// true if the instruction that the intrinsic node lowers to is embedded broadcast compatible. +// +bool GenTreeHWIntrinsic::OperIsEmbBroadcastCompatible() const +{ + return HWIntrinsicInfo::IsEmbBroadcastCompatible(GetHWIntrinsicId()); +} + +//------------------------------------------------------------------------ +// OperIsBroadcastScalar: Is this HWIntrinsic a broadcast node from a scalar. +// +// Return Value: +// Whether "this" is a broadcast node from a scalar. +// +bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const +{ +#if defined(TARGET_XARCH) + NamedIntrinsic intrinsicId = GetHWIntrinsicId(); + switch (intrinsicId) + { + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_SSE3_MoveAndDuplicate: + case NI_AVX512F_BroadcastScalarToVector512: + return true; + default: + return false; + } +#else + return false; +#endif +} + //------------------------------------------------------------------------------ // OperRequiresAsgFlag : Check whether the operation requires GTF_ASG flag regardless // of the children's flags. 
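The emitter-side wiring above can be summarized as: the new instrDesc bit _idEvexbContext records that an instruction uses embedded broadcast, AddSimdPrefixIfNeeded ORs EVEX_B_BIT into the prefix, HasEmbeddedBroadcast now consults the instrDesc, and emitGetMemOpSize reports the scalar element size instead of the full vector size. A minimal standalone sketch of that size computation, using simplified stand-ins for the JIT's emitAttr machinery (illustrative only, not the JIT's actual types):

    #include <cassert>
    #include <cstddef>

    enum EmitAttr
    {
        EA_4BYTE  = 4,
        EA_8BYTE  = 8,
        EA_64BYTE = 64,
    };

    // With EVEX.b set for embedded broadcast, the memory operand is a single
    // 4- or 8-byte scalar that the hardware replicates across all lanes, so
    // the reported memory access size collapses to the element size.
    EmitAttr GetMemOpSize(bool evexbContext, size_t inputSizeInBytes, EmitAttr defaultSize)
    {
        if (evexbContext)
        {
            switch (inputSizeInBytes)
            {
                case 4:
                    return EA_4BYTE; // e.g. vaddps zmm, zmm, dword ptr [mem]{1to16}
                case 8:
                    return EA_8BYTE; // e.g. vaddpd zmm, zmm, qword ptr [mem]{1to8}
                default:
                    assert(!"embedded broadcast only supports 4- and 8-byte elements");
            }
        }
        return defaultSize; // full vector access otherwise
    }
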
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index a050f2d6e2bb4..2f6def6803a5b 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -555,6 +555,7 @@ enum GenTreeFlags : unsigned int GTF_MDARRLEN_NONFAULTING = 0x20000000, // GT_MDARR_LENGTH -- An MD array length operation that cannot fault. Same as GT_IND_NONFAULTING. GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING. + }; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -6212,6 +6213,8 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryStore(GenTree** pAddr = nullptr) const; bool OperIsMemoryLoadOrStore() const; bool OperIsMemoryStoreOrBarrier() const; + bool OperIsEmbBroadcastCompatible() const; + bool OperIsBroadcastScalar() const; bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 26599ba5b1de2..f30f6229be86d 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -201,6 +201,8 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic is a PermuteVar2x intrinsic HW_Flag_PermuteVar2x = 0x4000000, #endif // TARGET_XARCH + // The intrinsic is an embedded broadcast compatible intrinsic + HW_Flag_EmbBroadcastCompatible = 0x8000000, }; #if defined(TARGET_XARCH) @@ -769,6 +771,12 @@ struct HWIntrinsicInfo return (flags & HW_Flag_Commutative) != 0; } + static bool IsEmbBroadcastCompatible(NamedIntrinsic id) + { + HWIntrinsicFlag flags = lookupFlags(id); + return (flags & HW_Flag_EmbBroadcastCompatible) != 0; + } + static bool IsMaybeCommutative(NamedIntrinsic id) { HWIntrinsicFlag flags = lookupFlags(id); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index b804d4a0edd5d..246799bdfc1d0 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -360,7 +360,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE Intrinsics -HARDWARE_INTRINSIC(SSE, Add, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE, Add, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -459,7 +459,7 @@ HARDWARE_INTRINSIC(SSE_X64, ConvertScalarToVector128Single, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE2 Intrinsics -HARDWARE_INTRINSIC(SSE2, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE2, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -675,7 +675,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX Intrinsics -HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, 
HW_Flag_NoFlag) @@ -754,7 +754,7 @@ HARDWARE_INTRINSIC(AVX, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX2 Intrinsics HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -826,7 +826,7 @@ HARDWARE_INTRINSIC(AVX2, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, Abs, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pabsd, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, 
HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index b942ddd6d878d..93c4e601bb781 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -335,6 +335,22 @@ bool CodeGenInterface::instIsFP(instruction ins) #endif } +#if defined(TARGET_XARCH) +/***************************************************************************** + * + * Returns non-zero if the given CPU instruction is an embedded broadcast + * compatible instruction. + */ + +// static inline +bool CodeGenInterface::instIsEmbeddedBroadcastCompatible(instruction ins) +{ + assert((unsigned)ins < ArrLen(instInfo)); + + return (instInfo[ins] & INS_Flags_EmbeddedBroadcastSupported) != 0; +} +#endif // TARGET_XARCH + /***************************************************************************** * * Generate a set instruction. @@ -749,7 +765,7 @@ void CodeGen::inst_RV_SH( // logic for determining what "kind" of operand "op" is. // // Arguments: -// op - The operand node for which to obtain the descriptor +// op - The operand node for which to obtain the descriptor. // // Return Value: // The operand descriptor for "op". @@ -795,11 +811,69 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(op->OperIsHWIntrinsic()); #if defined(FEATURE_HW_INTRINSICS) - GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic(); - NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - + GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic(); + NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); + var_types simdBaseType = hwintrinsic->GetSimdBaseType(); switch (intrinsicId) { + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + { + // We assume that AVX_BroadcastScalarToVector* + // only takes a memory address as its operand. + assert(hwintrinsic->isContained()); + assert(hwintrinsic->OperIsMemoryLoad()); + assert(hwintrinsic->GetOperandCount() == 1); + assert(varTypeIsFloating(simdBaseType)); + GenTree* hwintrinsicChild = hwintrinsic->Op(1); + assert(hwintrinsicChild->isContained()); + if (hwintrinsicChild->OperIs(GT_LCL_ADDR, GT_CLS_VAR_ADDR, GT_CNS_INT, GT_LEA)) + { + addr = hwintrinsic->Op(1); + break; + } + else + { + assert(hwintrinsicChild->OperIs(GT_LCL_VAR)); + return OperandDesc(simdBaseType, hwintrinsicChild); + } + } + + case NI_SSE3_MoveAndDuplicate: + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + { + assert(hwintrinsic->isContained()); + if (intrinsicId == NI_SSE3_MoveAndDuplicate) + { + assert(simdBaseType == TYP_DOUBLE); + } + // If the broadcast node is contained, we must have a form like + // Broadcast -> CreateScalarUnsafe -> Scalar. + // If so, directly emit the scalar. + // In the code below, we specially handle the `Broadcast -> CNS_INT` form and + // handle the other cases recursively. 
+ GenTree* hwintrinsicChild = hwintrinsic->Op(1); + assert(hwintrinsicChild->isContained()); + if (hwintrinsicChild->OperIs(GT_CNS_INT)) + { + // A special case is when the operand of CreateScalarUnsafe is of an integer type: + // the CreateScalarUnsafe node gets folded away, so we directly match the pattern + // Broadcast -> CNS_INT(TYP_(U)INT). + ssize_t scalarValue = hwintrinsicChild->AsIntCon()->IconValue(); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), + genTypeSize(simdBaseType), simdBaseType); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + else + { + // If the operand of broadcast is not a constant integer, + // we handle all the other cases recursively. + return genOperandDesc(hwintrinsicChild); + } + break; + } case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -865,8 +939,10 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) return OperandDesc(emit->emitFltOrDblConst(op->AsDblCon()->DconValue(), emitTypeSize(op))); case GT_CNS_INT: + { assert(op->isContainedIntOrIImmed()); return OperandDesc(op->AsIntCon()->IconValue(), op->AsIntCon()->ImmedValNeedsReloc(compiler)); + } case GT_CNS_VEC: { @@ -1096,19 +1172,49 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT } } +//------------------------------------------------------------------------ +// IsEmbeddedBroadcastEnabled: determine if embedded broadcast can be enabled +// +// Arguments: +// ins -- The instruction being emitted +// op -- The second operand of the instruction. +// +#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) +bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) +{ + // To enable embedded broadcast, we need three things: + // 1. EVEX encoding enabled + // 2. An embedded broadcast compatible intrinsic + // 3. 
A contained broadcast scalar node + if (!GetEmitter()->UseEvexEncoding()) + { + return false; + } + if (!instIsEmbeddedBroadcastCompatible(ins)) + { + return false; + } + if (!op->isContained() || !op->OperIsHWIntrinsic()) + { + return false; + } + + return op->AsHWIntrinsic()->OperIsBroadcastScalar(); +} +#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS + //------------------------------------------------------------------------ // inst_RV_RV_TT: Generates an instruction that takes 2 operands: // a register operand and an operand that may be in memory or register // the result is returned in register // // Arguments: -// ins -- The instruction being emitted -// size -- The emit size attribute -// targetReg -- The target register -// op1Reg -- The first operand register -// op2 -- The second operand, which may be a memory node or a node producing a register -// isRMW -- true if the instruction is RMW; otherwise, false -// +// ins -- The instruction being emitted +// size -- The emit size attribute +// targetReg -- The target register +// op1Reg -- The first operand register +// op2 -- The second operand, which may be a memory node or a node producing a register +// isRMW -- true if the instruction is RMW; otherwise, false void CodeGen::inst_RV_RV_TT( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW) { @@ -1118,15 +1224,25 @@ void CodeGen::inst_RV_RV_TT( // TODO-XArch-CQ: Commutative operations can have op1 be contained // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained + insOpts instOptions = INS_OPTS_NONE; +#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) + bool IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); + if (IsEmbBroadcast) + { + instOptions = INS_OPTS_EVEX_b; + } +#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS OperandDesc op2Desc = genOperandDesc(op2); switch (op2Desc.GetKind()) { case OperandKind::ClsVar: - emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0); + { + emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, instOptions); break; - + } case OperandKind::Local: - emit->emitIns_SIMD_R_R_S(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset()); + emit->emitIns_SIMD_R_R_S(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset(), + instOptions); break; case OperandKind::Indir: @@ -1135,7 +1251,7 @@ void CodeGen::inst_RV_RV_TT( // temporary GT_IND to generate code with. 
GenTreeIndir indirForm; GenTreeIndir* indir = op2Desc.GetIndirForm(&indirForm); - emit->emitIns_SIMD_R_R_A(ins, size, targetReg, op1Reg, indir); + emit->emitIns_SIMD_R_R_A(ins, size, targetReg, op1Reg, indir, instOptions); } break; diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index df8658195f8da..c3efb39829a02 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -180,10 +180,20 @@ enum insFlags : uint64_t KInstruction = 1ULL << 41, + // EVEX feature: embedded broadcast + INS_Flags_EmbeddedBroadcastSupported = 1ULL << 42, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; +enum insOpts: unsigned +{ + INS_OPTS_NONE, + + INS_OPTS_EVEX_b +}; + #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // TODO-Cleanup: Move 'insFlags' under TARGET_ARM enum insFlags: unsigned diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 0bb638bfea69b..4cb9cc2955364 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -191,7 +191,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // SSE -INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles +INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed singles INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles @@ -240,7 +240,7 @@ INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles // SSE2 -INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles +INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed doubles INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | 
REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles @@ -290,8 +290,8 @@ INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers -INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers -INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers +INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed double-word (32-bit) integers +INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed quad-word (64-bit) integers INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 264405298e968..37dbedd12fb72 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -109,6 +109,9 @@ class Lowering final : public Phase #ifdef FEATURE_HW_INTRINSICS void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); +#ifdef TARGET_XARCH + void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); +#endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS #ifdef DEBUG diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 16120f1279d54..9f498b7342f7d 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2329,12 +2329,13 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) node->ResetHWIntrinsicId(NI_AVX512BW_BroadcastScalarToVector512, tmp1); break; } + case TYP_INT: case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: case TYP_FLOAT: case TYP_DOUBLE: + case TYP_LONG: + case 
TYP_ULONG: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); @@ -2370,7 +2371,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) LowerNode(tmp1); node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1); - return LowerNode(node); } @@ -2445,7 +2445,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Avx2.BroadcastScalarToVector128(tmp1); node->ChangeHWIntrinsicId(NI_AVX2_BroadcastScalarToVector128, tmp1); - return LowerNode(node); } @@ -7382,7 +7381,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre canBeContained = !vecCon->IsAllBitsSet() && !vecCon->IsZero(); } } - return canBeContained; } @@ -7503,6 +7501,36 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } + case NI_SSE3_MoveAndDuplicate: + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + { + // make the broadcast node containable when embedded broadcast can be enabled. + if (intrinsicId == NI_SSE3_MoveAndDuplicate) + { + // NI_SSE3_MoveAndDuplicate is for Vector128 only. + assert(childNode->AsHWIntrinsic()->GetSimdBaseType() == TYP_DOUBLE); + } + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && + parentNode->OperIsEmbBroadcastCompatible()) + { + GenTree* broadcastOperand = childNode->AsHWIntrinsic()->Op(1); + bool childSupportsRegOptional; + if (IsContainableHWIntrinsicOp(childNode->AsHWIntrinsic(), broadcastOperand, &childSupportsRegOptional)) + { + return true; + } + } + return false; + } + + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + { + return parentNode->OperIsEmbBroadcastCompatible(); + } + default: { assert(!childNode->isContainableHWIntrinsic()); @@ -7511,6 +7539,153 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } } +//---------------------------------------------------------------------------------------------- +// TryFoldCnsVecForEmbeddedBroadcast: +// Unfold the eligible constant vector when embedded broadcast is +// available. 
+ // + // Arguments: + // parentNode - The hardware intrinsic node + // childNode - The operand node to try to contain + // +void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode) +{ + assert(!childNode->IsAllBitsSet()); + assert(!childNode->IsZero()); + var_types simdType = parentNode->TypeGet(); + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); + switch (simdBaseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + uint32_t firstElement = static_cast<uint32_t>(childNode->gtSimdVal.u32[0]); + for (int i = 1; i < elementCount; i++) + { + uint32_t elementToCheck = static_cast<uint32_t>(childNode->gtSimdVal.u32[i]); + if (firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + case TYP_DOUBLE: +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: +#endif // TARGET_AMD64 + { + uint64_t firstElement = static_cast<uint64_t>(childNode->gtSimdVal.u64[0]); + for (int i = 1; i < elementCount; i++) + { + uint64_t elementToCheck = static_cast<uint64_t>(childNode->gtSimdVal.u64[i]); + if (firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + default: + isCreatedFromScalar = false; + break; + } + if (isCreatedFromScalar) + { + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + if (simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if (simdType == TYP_SIMD64) + { + broadcastName = NI_AVX512F_BroadcastScalarToVector512; + } + else + { + assert(simdType == TYP_SIMD16); + } + GenTree* constScalar = nullptr; + switch (simdBaseType) + { + case TYP_FLOAT: + { + float scalar = static_cast<float>(childNode->gtSimdVal.f32[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_DOUBLE: + { + double scalar = static_cast<double>(childNode->gtSimdVal.f64[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_INT: + { + int32_t scalar = static_cast<int32_t>(childNode->gtSimdVal.i32[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_UINT: + { + uint32_t scalar = static_cast<uint32_t>(childNode->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); + break; + } +#if defined(TARGET_AMD64) + case TYP_LONG: + { + int64_t scalar = static_cast<int64_t>(childNode->gtSimdVal.i64[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_ULONG: + { + uint64_t scalar = static_cast<uint64_t>(childNode->gtSimdVal.u64[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + break; + } +#endif // TARGET_AMD64 + default: + unreached(); + } + GenTreeHWIntrinsic* createScalar = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, + 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, + simdBaseJitType, genTypeSize(simdType)); + BlockRange().InsertBefore(childNode, broadcastNode); + BlockRange().InsertBefore(broadcastNode, createScalar); + BlockRange().InsertBefore(createScalar, constScalar); + LIR::Use use; + BlockRange().TryGetUse(childNode, &use); + use.ReplaceWith(broadcastNode); + BlockRange().Remove(childNode); + LowerNode(createScalar); + LowerNode(broadcastNode); + if (varTypeIsFloating(simdBaseType)) + { + MakeSrcContained(broadcastNode, 
createScalar); + } + else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG)) + { + MakeSrcContained(broadcastNode, constScalar); + } + MakeSrcContained(parentNode, broadcastNode); + return; + } + MakeSrcContained(parentNode, childNode); +} + //---------------------------------------------------------------------------------------------- // ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware // intrinsic node. @@ -7819,13 +7994,29 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - MakeSrcContained(node, op2); + if (op2->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, op2->AsVecCon()); + } + else + { + MakeSrcContained(node, op2); + } } else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); + if (op1->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, op1->AsVecCon()); + } + else + { + MakeSrcContained(node, op1); + } // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(1), node->Op(2));