Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Improve Bmi2.MultiplyNoFlags codegen #22047

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions src/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2254,9 +2254,9 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
emitter* emit = getEmitter();

assert(targetReg != REG_NA);
assert(op1 != nullptr);
assert(op1 != nullptr || intrinsicId == NI_BMI2_MultiplyNoFlags2ndValue);

if (!op1->OperIsList())
if (op1 != nullptr && !op1->OperIsList())
{
genConsumeOperands(node);
}
Expand Down Expand Up @@ -2317,7 +2317,17 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
op1Reg = op1->gtRegNum;
op2Reg = op2->gtRegNum;
lowReg = targetReg;

// Pick the register used as `lowReg`.
// If the node immediately following this one is the
// NI_BMI2_MultiplyNoFlags2ndValue placeholder intrinsic, use the register
// that was assigned to that placeholder node.
if (node->gtNext != nullptr && node->gtNext->OperIsHWIntrinsic() &&
node->gtNext->AsHWIntrinsic()->gtHWIntrinsicId == NI_BMI2_MultiplyNoFlags2ndValue)
{
lowReg = node->gtNext->GetRegNum();
// The placeholder must have a valid register, and it must differ from targetReg.
assert(lowReg < REG_COUNT && lowReg != targetReg);
}
else
{
// No placeholder follows this node: use targetReg for lowReg.
lowReg = targetReg;
}
}
else
{
Expand All @@ -2342,10 +2352,15 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
}

emitAttr attr = emitTypeSize(targetType);
// mov the first operand into implicit source operand EDX/RDX
if (op1Reg != REG_EDX)
// Make sure one multiplicand is in EDX/RDX, the implicit source operand.
// Three cases:
//   1. op2 is already in EDX: no mov is emitted.
//   2. neither operand is in EDX: copy op1 into EDX.
//   3. op1 is already in EDX (neither branch taken): nothing to do.
if (op2Reg == REG_EDX)
{
assert(op1Reg != REG_EDX);
// From here on `op2` refers to the op1 node.
// NOTE(review): presumably the emit code that follows (not visible here) uses
// `op2` as the explicit source operand - confirm.
op2 = op1;
}
else if (op1Reg != REG_EDX)
{
assert(op2Reg != REG_EDX);
// mov the first operand into implicit source operand EDX/RDX
assert(op1Reg < REG_COUNT);
emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
}

Expand All @@ -2361,6 +2376,13 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
break;
}

// Placeholder node for the second result of MultiplyNoFlags. This case performs
// no work of its own; it only asserts that the node has a valid register and
// that the immediately preceding node is a MultiplyNoFlags (or X64 variant)
// hardware intrinsic.
// NOTE(review): node->gtPrev is dereferenced inside the assert without a null check.
case NI_BMI2_MultiplyNoFlags2ndValue:
assert(node->GetRegNum() < REG_COUNT);
assert(node->gtPrev->OperIsHWIntrinsic() &&
(node->gtPrev->AsHWIntrinsic()->gtHWIntrinsicId == NI_BMI2_MultiplyNoFlags ||
node->gtPrev->AsHWIntrinsic()->gtHWIntrinsicId == NI_BMI2_X64_MultiplyNoFlags));
break;

default:
{
unreached();
Expand Down
2 changes: 1 addition & 1 deletion src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ HARDWARE_INTRINSIC(BMI2_ParallelBitDeposit, "ParallelBit
HARDWARE_INTRINSIC(BMI2_ParallelBitExtract, "ParallelBitExtract", BMI2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI2_ZeroHighBits, "ZeroHighBits", BMI2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(BMI2_MultiplyNoFlags, "MultiplyNoFlags", BMI2, -1, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)

HARDWARE_INTRINSIC(BMI2_MultiplyNoFlags2ndValue, "MultiplyNoFlags2ndValue", BMI2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
Expand Down
24 changes: 24 additions & 0 deletions src/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,30 @@ GenTree* Compiler::impBMI1OrBMI2Intrinsic(NamedIntrinsic intrinsic,

if (sig->numArgs == 3)
{
// Optimized path: taken only when optimizations are enabled and op3 is the address
// (GT_ADDR) of a local variable (GT_LCL_VAR) whose type equals callType.
if (opts.OptimizationEnabled() && op3->OperGet() == GT_ADDR &&
op3->gtGetOp1()->OperGet() == GT_LCL_VAR && op3->gtGetOp1()->TypeGet() == callType)
{
// Grab a fresh temp to receive the 2nd result value and retarget the addressed
// local to it. Lowering is expected to remove this temp again later.
GenTreeLclVar* op3Local = op3->gtGetOp1()->AsLclVar();
const unsigned orgNum = op3Local->GetLclNum();
const unsigned tmpNum = lvaGrabTemp(true DEBUGARG("MULX low part"));
assert(tmpNum != 0);
lvaTable[tmpNum].lvType = callType;
op3Local->SetLclNum(tmpNum);

// op3 now addresses the temp rather than the original local.
GenTree* mulx = gtNewScalarHWIntrinsicNode(callType, op1, op2, op3, intrinsic);

// Copy the 2nd value from the temp into the original local and stitch both trees together.
// This transformation produces correct results even if lowering fails to optimize it away.
GenTree* mov = gtNewAssignNode(gtNewLclvNode(orgNum, callType), gtNewLclvNode(tmpNum, callType));

// COMMA(mov, mulx) marked with GTF_REVERSE_OPS.
// NOTE(review): presumably the flag makes mulx execute before mov while the
// comma still yields mulx's value - confirm.
GenTree* comma = gtNewOperNode(GT_COMMA, callType, mov, mulx);
comma->gtFlags |= GTF_REVERSE_OPS;
// Propagate the effect flags of both children to the comma node.
comma->gtFlags |= (mulx->gtFlags & GTF_ALL_EFFECT);
comma->gtFlags |= (mov->gtFlags & GTF_ALL_EFFECT);
return comma;
}

return gtNewScalarHWIntrinsicNode(callType, op1, op2, op3, intrinsic);
}
else
Expand Down
29 changes: 29 additions & 0 deletions src/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3151,6 +3151,35 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
{
op2->SetRegOptional();
}

// Candidate check for dropping the temp that holds the 2nd result value.
// Required: optimizations enabled, this node's value is used, op3 is a
// GT_LCL_VAR_ADDR, and the node right after this one is a GT_LCL_VAR.
if (comp->opts.OptimizationEnabled() && !node->IsUnusedValue() &&
op3->OperGet() == GT_LCL_VAR_ADDR && node->gtNext != nullptr &&
node->gtNext->OperGet() == GT_LCL_VAR)
{
GenTreeLclVar* op3Lcl = op3->AsLclVar();
GenTreeLclVar* nextLcl = node->gtNext->AsLclVar();
LclVarDsc* tmp = comp->lvaTable + op3Lcl->GetLclNum();

// The addressed local must be a temp of the node's type, and the following
// GT_LCL_VAR must read that same local.
if (tmp->lvIsTemp && tmp->TypeGet() == node->TypeGet() &&
op3Lcl->GetLclNum() == nextLcl->GetLclNum())
{
// If we allocated a temp local for the 2nd result value in the importer and all
// pre-conditions for optimization still hold, then remove the temp and replace
// its use with a special value node.

// Clear the address-exposed and do-not-enregister markings on the temp.
tmp->lvAddrExposed = 0;
tmp->lvDoNotEnregister = 0;

// Drop the address operand: the node keeps only op1 and op2.
BlockRange().Remove(op3);
node->gtOp1 = op1;
node->gtOp2 = op2;

// Turn the following local read into the operand-less
// NI_BMI2_MultiplyNoFlags2ndValue placeholder intrinsic.
nextLcl->ReplaceWith(
comp->gtNewScalarHWIntrinsicNode(node->TypeGet(), nullptr,
NI_BMI2_MultiplyNoFlags2ndValue),
comp);
}
}
break;
}

Expand Down
16 changes: 14 additions & 2 deletions src/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2523,8 +2523,20 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
case NI_BMI2_X64_MultiplyNoFlags:
{
assert(numArgs == 2 || numArgs == 3);
srcCount += BuildOperandUses(op1, RBM_EDX);
srcCount += BuildOperandUses(op2);
// Decide which operand is constrained to EDX/RDX (RBM_EDX).
// op2 is chosen when op1 is contained, or - with optimizations enabled - when op2
// is a tracked local whose interval currently reports physReg == REG_EDX.
// Otherwise op1 is constrained, as before.
// NOTE(review): confirm that the interval's physReg is meaningful at the point
// where this build step runs.
if (op1->isContained() || (compiler->opts.OptimizationEnabled() && op2->OperGet() == GT_LCL_VAR &&
compiler->lvaTable[op2->AsLclVar()->gtLclNum].lvTracked &&
getIntervalForLocalVarNode(op2->AsLclVar())->physReg == REG_EDX))
{
// op2 takes the EDX slot, so it is no longer marked reg-optional.
// Its use is built first, then op1's.
op2->ClearRegOptional();
srcCount += BuildOperandUses(op2, RBM_EDX);
srcCount += BuildOperandUses(op1);
}
else
{
// Default: op1 is constrained to EDX, op2 is unconstrained.
srcCount += BuildOperandUses(op1, RBM_EDX);
srcCount += BuildOperandUses(op2);
}

if (numArgs == 3)
{
// op3 reg should be different from target reg to
Expand Down