diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 386183737d760e..18a27b41aff49e 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -974,24 +974,22 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(TARGET_XARCH) void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber reg, GenTree* rmOp); void genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival); - void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr); - void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival); - void genHWIntrinsic_R_R_RM( - GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2); + void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions); void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival); void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr); void genHWIntrinsic_R_R_R_RM( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3); void genHWIntrinsic_R_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival); + void genBaseIntrinsic(GenTreeHWIntrinsic* node); void genX86BaseIntrinsic(GenTreeHWIntrinsic* node); - void genSSEIntrinsic(GenTreeHWIntrinsic* node); - void genSSE2Intrinsic(GenTreeHWIntrinsic* node); + void genSSEIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); + void genSSE2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genSSE41Intrinsic(GenTreeHWIntrinsic* node); void genSSE42Intrinsic(GenTreeHWIntrinsic* node); - void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node); + void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void 
genAESIntrinsic(GenTreeHWIntrinsic* node); - void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node); + void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genFMAIntrinsic(GenTreeHWIntrinsic* node); void genPermuteVar2x(GenTreeHWIntrinsic* node); void genLZCNTIntrinsic(GenTreeHWIntrinsic* node); @@ -999,6 +997,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node); void genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins); void genX86SerializeIntrinsic(GenTreeHWIntrinsic* node); + template void genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic, regNumber nonConstImmReg, @@ -1562,7 +1561,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival); void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival); - void inst_RV_RV_TT(instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW); + void inst_RV_RV_TT(instruction ins, + emitAttr size, + regNumber targetReg, + regNumber op1Reg, + GenTree* op2, + bool isRMW, + insOpts instOptions); void inst_RV_RV_TT_IV( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW); #endif diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b806ca975c5072..73d0863de584a3 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1057,7 +1057,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // all have RMW semantics if VEX support is not available bool isRMW = !compiler->canUseVexEncoding(); - inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, op1reg, op2, isRMW); + inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, 
op1reg, op2, isRMW, INS_OPTS_NONE); genProduceReg(treeNode); return; @@ -7438,7 +7438,7 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode) // is not available bool isRMW = !compiler->canUseVexEncoding(); - inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW); + inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE); } genProduceReg(treeNode); @@ -7550,7 +7550,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // is not available bool isRMW = !compiler->canUseVexEncoding(); - inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW); + inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE); // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction // will interpret ULONG value as LONG. Hence we need to adjust the @@ -8071,7 +8071,7 @@ void CodeGen::genIntrinsic(GenTreeIntrinsic* treeNode) regNumber targetReg = treeNode->GetRegNum(); bool isRMW = !compiler->canUseVexEncoding(); - inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, targetReg, srcNode, isRMW); + inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, targetReg, srcNode, isRMW, INS_OPTS_NONE); break; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index ef4dd9be146911..22ea8e28a09f4f 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -554,13 +554,6 @@ enum GenTreeFlags : unsigned int GTF_MDARRLEN_NONFAULTING = 0x20000000, // GT_MDARR_LENGTH -- An MD array length operation that cannot fault. Same as GT_IND_NONFAULTING. GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING. 
- - GTF_HW_ER_MASK = 0x30000000, // Bits used by handle types below - GTF_HW_ER_TO_EVEN = 0x00000000, // GT_HWINTRINSIC -- embedded rounding mode: FloatRoundingMode = ToEven (Default) "{rn-sae}" - GTF_HW_ER_TO_NEGATIVEINFINITY = 0x10000000, // GT_HWINTRINSIC -- embedded rounding mode: FloatRoundingMode = ToNegativeInfinity "{rd-sae}" - GTF_HW_ER_TO_POSITIVEINFINITY = 0x20000000, // GT_HWINTRINSIC -- embedded rounding mode: FloatRoundingMode = ToPositiveInfinity "{ru-sae}" - GTF_HW_ER_TO_ZERO = 0x30000000, // GT_HWINTRINSIC -- embedded rounding mode: FloatRoundingMode = ToZero "{rz-sae}" - }; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -2230,43 +2223,6 @@ struct GenTree return (gtOper == GT_CNS_INT) ? (gtFlags & GTF_ICON_HDL_MASK) : GTF_EMPTY; } -#ifdef FEATURE_HW_INTRINSICS - - void ClearEmbRoundingMode() - { - assert(gtOper == GT_HWINTRINSIC); - gtFlags &= ~GTF_HW_ER_MASK; - } - // Set GenTreeFlags on HardwareIntrinsic node to specify the FloatRoundingMode. - // mode can be one of the values from System.Runtime.Intrinsics.X86.FloatRoundingMode. - void SetEmbRoundingMode(uint8_t mode) - { - assert(gtOper == GT_HWINTRINSIC); - ClearEmbRoundingMode(); - switch (mode) - { - case 0x09: - gtFlags |= GTF_HW_ER_TO_NEGATIVEINFINITY; - break; - case 0x0A: - gtFlags |= GTF_HW_ER_TO_POSITIVEINFINITY; - break; - case 0x0B: - gtFlags |= GTF_HW_ER_TO_ZERO; - break; - default: - break; - } - } - - uint8_t GetEmbRoundingMode() - { - assert(gtOper == GT_HWINTRINSIC); - return (uint8_t)((gtFlags & GTF_HW_ER_MASK) >> 28); - } - -#endif // FEATURE_HW_INTRINSICS - // Mark this node as no longer being a handle; clear its GTF_ICON_*_HDL bits. 
void ClearIconHandleMask() { diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 00342a2d820f35..ec936c349c8f2d 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -82,11 +82,69 @@ static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicC return tableDrivenCategory && tableDrivenFlag; } +//------------------------------------------------------------------------ +// AddEmbRoundingMode: Adds the embedded rounding mode to the insOpts +// +// Arguments: +// instOptions - The existing insOpts +// mode - The embedded rounding mode to add to instOptions +// +// Return Value: +// The modified insOpts +// +static insOpts AddEmbRoundingMode(insOpts instOptions, int8_t mode) +{ + // The full rounding mode is a bitmask in the shape of: + // * RC: 2-bit rounding control + // * RS: 1-bit rounding select + // * P: 1-bit precision mask + // * 4-bit reserved + // + // The embedded rounding form assumes that P is 1, indicating + // that floating-point exceptions should not be raised and also + // assumes that RS is 0, indicating that MXCSR.RC is ignored. + // + // Given that the user is specifying a rounding mode and that + // .NET doesn't support raising IEEE 754 floating-point exceptions, + // we simplify the handling below to only consider the 2-bits of RC. + + assert((instOptions & INS_OPTS_b_MASK) == 0); + unsigned result = static_cast<unsigned>(instOptions); + + switch (mode & 0x03) + { + case 0x01: + { + result |= INS_OPTS_EVEX_eb_er_rd; + break; + } + + case 0x02: + { + result |= INS_OPTS_EVEX_er_ru; + break; + } + + case 0x03: + { + result |= INS_OPTS_EVEX_er_rz; + break; + } + + default: + { + break; + } + } + + return static_cast<insOpts>(result); +} + //------------------------------------------------------------------------ // genHWIntrinsic: Generates the code for a given hardware intrinsic node. 
// // Arguments: -// node - The hardware intrinsic node +// node - The hardware intrinsic node // void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { @@ -99,7 +157,89 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) assert(compiler->compIsaSupportedDebugOnly(isa)); assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); - if (genIsTableDrivenHWIntrinsic(intrinsicId, category)) + bool isTableDriven = genIsTableDrivenHWIntrinsic(intrinsicId, category); + insOpts instOptions = INS_OPTS_NONE; + + if (GetEmitter()->UseEvexEncoding()) + { + if (HWIntrinsicInfo::IsEmbRoundingCompatible(intrinsicId)) + { + assert(isTableDriven); + size_t expectedArgNum = HWIntrinsicInfo::EmbRoundingArgPos(intrinsicId); + + if (numArgs == expectedArgNum) + { + GenTree* lastOp = node->Op(numArgs); + + // Now that we've extracted the rounding mode, we'll remove the + // last operand, adjust the arg count, and continue. This allows + // us to reuse all the existing logic without having to add new + // specialized handling everywhere. 
+ + switch (numArgs) + { + case 3: + { + numArgs = 2; + node->ResetHWIntrinsicId(intrinsicId, compiler, node->Op(1), node->Op(2)); + break; + } + + default: + { + unreached(); + } + } + + if (lastOp->isContained()) + { + assert(lastOp->IsCnsIntOrI()); + + int8_t mode = static_cast<int8_t>(lastOp->AsIntCon()->IconValue()); + instOptions = AddEmbRoundingMode(instOptions, mode); + } + else + { + var_types baseType = node->GetSimdBaseType(); + + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); + assert(ins != INS_invalid); + + emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize())); + assert(simdSize != 0); + + genConsumeMultiOpOperands(node); + genConsumeRegs(lastOp); + + switch (numArgs) + { + case 2: + { + auto emitSwCase = [&](int8_t i) { + insOpts newInstOptions = AddEmbRoundingMode(instOptions, i); + genHWIntrinsic_R_R_RM(node, ins, simdSize, newInstOptions); + }; + regNumber baseReg = node->ExtractTempReg(); + regNumber offsReg = node->GetSingleTempReg(); + genHWIntrinsicJumpTableFallback(intrinsicId, lastOp->GetRegNum(), baseReg, offsReg, + emitSwCase); + break; + } + + default: + { + unreached(); + } + } + + genProduceReg(node); + return; + } + } + } + } + + if (isTableDriven) { regNumber targetReg = node->GetRegNum(); var_types baseType = node->GetSimdBaseType(); @@ -231,7 +371,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // Until we improve the handling of addressing modes in the emitter, we'll create a // temporary GT_IND to generate code with. 
GenTreeIndir load = indirForm(node->TypeGet(), addr); - genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load); + + assert(!node->isRMWHWIntrinsic(compiler)); + inst_RV_RV_TT(ins, simdSize, targetReg, otherReg, &load, false, instOptions); } else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) { @@ -272,7 +414,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else { - genHWIntrinsic_R_R_RM(node, ins, simdSize); + genHWIntrinsic_R_R_RM(node, ins, simdSize, instOptions); } break; } @@ -333,13 +475,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg); } } - else if (HWIntrinsicInfo::IsEmbRoundingCompatible(intrinsicId) && !op3->IsCnsIntOrI()) - { - auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM(node, ins, simdSize, i); }; - regNumber baseReg = node->ExtractTempReg(); - regNumber offsReg = node->GetSingleTempReg(); - genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase); - } else { switch (intrinsicId) @@ -426,6 +561,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) unreached(); break; } + genProduceReg(node); return; } @@ -443,11 +579,11 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; case InstructionSet_SSE: case InstructionSet_SSE_X64: - genSSEIntrinsic(node); + genSSEIntrinsic(node, instOptions); break; case InstructionSet_SSE2: case InstructionSet_SSE2_X64: - genSSE2Intrinsic(node); + genSSE2Intrinsic(node, instOptions); break; case InstructionSet_SSE41: case InstructionSet_SSE41_X64: @@ -466,7 +602,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case InstructionSet_AVX512BW_VL: case InstructionSet_AVX512VBMI: case InstructionSet_AVX512VBMI_VL: - genAvxFamilyIntrinsic(node); + genAvxFamilyIntrinsic(node, instOptions); break; case InstructionSet_AES: genAESIntrinsic(node); @@ -475,7 +611,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case InstructionSet_BMI1_X64: case InstructionSet_BMI2: case 
InstructionSet_BMI2_X64: - genBMI1OrBMI2Intrinsic(node); + genBMI1OrBMI2Intrinsic(node, instOptions); break; case InstructionSet_FMA: genFMAIntrinsic(node); @@ -698,11 +834,12 @@ void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, e // register/memory operand, and that returns a value in register // // Arguments: -// node - The hardware intrinsic node -// ins - The instruction being generated -// attr - The emit attribute for the instruction being generated +// node - The hardware intrinsic node +// ins - The instruction being generated +// attr - The emit attribute for the instruction being generated +// instOptions - The options that modify how the instruction is generated // -void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr) +void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions) { regNumber targetReg = node->GetRegNum(); GenTree* op1 = node->Op(1); @@ -712,52 +849,6 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, e assert(targetReg != REG_NA); assert(op1Reg != REG_NA); - genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2); -} - -//------------------------------------------------------------------------ -// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a -// register/memory operand, and that returns a value in register -// -// Arguments: -// node - The hardware intrinsic node -// ins - The instruction being generated -// attr - The emit attribute for the instruction being generated -// ival - a "fake" immediate to indicate the rounding mode -// -void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival) -{ - regNumber targetReg = node->GetRegNum(); - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); - regNumber op1Reg = op1->GetRegNum(); - - assert(targetReg != REG_NA); - 
assert(op1Reg != REG_NA); - - node->SetEmbRoundingMode((uint8_t)ival); - - genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2); -} - -//------------------------------------------------------------------------ -// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a -// register/memory operand, and that returns a value in register -// -// Arguments: -// node - The hardware intrinsic node -// ins - The instruction being generated -// attr - The emit attribute for the instruction being generated -// targetReg - The register allocated to the result -// op1Reg - The register allocated to the first operand -// op2 - Another operand that maybe in register or memory -// -void CodeGen::genHWIntrinsic_R_R_RM( - GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2) -{ - assert(targetReg != REG_NA); - assert(op1Reg != REG_NA); - if (op2->isContained() || op2->isUsedFromSpillTemp()) { assert(HWIntrinsicInfo::SupportsContainment(node->GetHWIntrinsicId())); @@ -765,34 +856,7 @@ void CodeGen::genHWIntrinsic_R_R_RM( } bool isRMW = node->isRMWHWIntrinsic(compiler); - - if (node->GetEmbRoundingMode() != 0) - { - // As embedded rounding only appies in R_R_R case, we can skip other checks for different paths. - OperandDesc op2Desc = genOperandDesc(op2); - assert(op2Desc.GetKind() == OperandKind::Reg); - regNumber op2Reg = op2Desc.GetReg(); - - if ((op1Reg != targetReg) && (op2Reg == targetReg) && isRMW) - { - // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW instruction. - // - // For non-commutative instructions, we should have ensured that op2 was marked - // delay free in order to prevent it from getting assigned the same register - // as target. However, for commutative instructions, we can just swap the operands - // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. 
- - op2Reg = op1Reg; - op1Reg = targetReg; - } - - uint8_t mode = node->GetEmbRoundingMode(); - insOpts instOptions = GetEmitter()->GetEmbRoundingMode(mode); - GetEmitter()->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, instOptions); - return; - } - - inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW); + inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW, instOptions); } //------------------------------------------------------------------------ @@ -1526,9 +1590,10 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node) // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node // // Arguments: -// node - The hardware intrinsic node +// node - The hardware intrinsic node +// instOptions - The options used when generating the instruction. // -void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); regNumber targetReg = node->GetRegNum(); @@ -1553,7 +1618,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) { assert(baseType == TYP_LONG); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); - genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE); + genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE, instOptions); break; } @@ -1590,9 +1655,10 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node // // Arguments: -// node - The hardware intrinsic node +// node - The hardware intrinsic node +// instOptions - The options used when generating the instruction. 
// -void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); regNumber targetReg = node->GetRegNum(); @@ -1608,7 +1674,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) { assert(baseType == TYP_LONG); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); - genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE); + genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE, instOptions); break; } @@ -1812,9 +1878,10 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node) // genAvxFamilyIntrinsic: Generates the code for an AVX/AVX2/AVX512 hardware intrinsic node // // Arguments: -// node - The hardware intrinsic node +// node - The hardware intrinsic node +// instOptions - The options used when generating the instruction. // -void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); @@ -2457,7 +2524,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) { assert(baseType == TYP_ULONG); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); - genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE); + genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE, instOptions); break; } @@ -2484,9 +2551,10 @@ void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node) // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node // // Arguments: -// node - The hardware intrinsic node +// node - The hardware intrinsic node +// instOptions - The options used when generating the instruction. 
// -void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); regNumber targetReg = node->GetRegNum(); @@ -2512,7 +2580,7 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) case NI_BMI2_X64_ZeroHighBits: { assert((targetType == TYP_INT) || (targetType == TYP_LONG)); - genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet())); + genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()), instOptions); break; } @@ -2576,7 +2644,8 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node) emit->emitIns_Mov(INS_mov, attr, REG_EDX, op1Reg, /* canSkip */ true); // generate code for MULX - genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2); + assert(!node->isRMWHWIntrinsic(compiler)); + inst_RV_RV_TT(ins, attr, targetReg, lowReg, op2, false, INS_OPTS_NONE); // If requires the lower half result, store in the memory pointed to by op3 if (numArgs == 3) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 473e38df0a8706..80d326bbcf2a08 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1211,6 +1211,22 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) return op->AsHWIntrinsic()->OperIsBroadcastScalar(); } + +//------------------------------------------------------------------------ +// AddEmbBroadcastMode: Adds the embedded broadcast mode to the insOpts +// +// Arguments: +// instOptions - The existing insOpts +// +// Return Value: +// The modified insOpts +// +static insOpts AddEmbBroadcastMode(insOpts instOptions) +{ + assert((instOptions & INS_OPTS_b_MASK) == 0); + unsigned result = static_cast<unsigned>(instOptions); + return static_cast<insOpts>(result | INS_OPTS_EVEX_eb_er_rd); +} #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ @@ -1225,21 +1241,26 @@ 
bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) // op1Reg -- The first operand register // op2 -- The second operand, which may be a memory node or a node producing a register // isRMW -- true if the instruction is RMW; otherwise, false -void CodeGen::inst_RV_RV_TT( - instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW) +// instOptions -- The options that modify how the instruction is generated +void CodeGen::inst_RV_RV_TT(instruction ins, + emitAttr size, + regNumber targetReg, + regNumber op1Reg, + GenTree* op2, + bool isRMW, + insOpts instOptions) { emitter* emit = GetEmitter(); noway_assert(emit->emitVerifyEncodable(ins, EA_SIZE(size), targetReg)); - // TODO-XArch-CQ: Commutative operations can have op1 be contained - // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained +// TODO-XArch-CQ: Commutative operations can have op1 be contained +// TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - insOpts instOptions = INS_OPTS_NONE; #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) - bool IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); - if (IsEmbBroadcast) + if (CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2)) { - instOptions = INS_OPTS_EVEX_eb_er_rd; + instOptions = AddEmbBroadcastMode(instOptions); + if (emitter::IsBitwiseInstruction(ins) && varTypeIsLong(op2->AsHWIntrinsic()->GetSimdBaseType())) { switch (ins) @@ -1266,7 +1287,9 @@ void CodeGen::inst_RV_RV_TT( } } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS + OperandDesc op2Desc = genOperandDesc(op2); + switch (op2Desc.GetKind()) { case OperandKind::ClsVar: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a149f71e9b4aec..ff9cd371570fa0 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1071,59 +1071,23 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) { size_t numArgs 
= node->GetOperandCount(); size_t expectedArgNum = HWIntrinsicInfo::EmbRoundingArgPos(intrinsicId); + if (numArgs == expectedArgNum) { - CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); - uint32_t simdSize = node->GetSimdSize(); - var_types simdType = node->TypeGet(); - GenTree* lastOp = node->Op(numArgs); + GenTree* lastOp = node->Op(numArgs); + uint8_t mode = 0xFF; if (lastOp->IsCnsIntOrI()) { - uint8_t mode = static_cast<uint8_t>(lastOp->AsIntCon()->IconValue()); - - GenTreeHWIntrinsic* embRoundingNode; - switch (numArgs) - { - case 3: - embRoundingNode = comp->gtNewSimdHWIntrinsicNode(simdType, node->Op(1), node->Op(2), - intrinsicId, simdBaseJitType, simdSize); - break; - case 2: - embRoundingNode = comp->gtNewSimdHWIntrinsicNode(simdType, node->Op(1), intrinsicId, - simdBaseJitType, simdSize); - break; + // Mark the constant as contained since it's specially encoded + MakeSrcContained(node, lastOp); - default: - unreached(); - } - - embRoundingNode->SetEmbRoundingMode(mode); - BlockRange().InsertAfter(lastOp, embRoundingNode); - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - use.ReplaceWith(embRoundingNode); - } - else - { - embRoundingNode->SetUnusedValue(); - } - BlockRange().Remove(node); - BlockRange().Remove(lastOp); - node = embRoundingNode; - if (mode != 0x08) - { - // As embedded rounding can only work under register-to-register form, we can skip contain check at - // this point. - return node->gtNext; - } + mode = static_cast<uint8_t>(lastOp->AsIntCon()->IconValue()); } - else + + if ((mode & 0x03) != 0x00) { - // If the control byte is not constant, generate a jump table fallback when emitting the code. - assert(!lastOp->IsCnsIntOrI()); - node->SetEmbRoundingMode(0x08); + // Embedded rounding only works for register-to-register operations, so skip containment return node->gtNext; } }