Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM64-SVE: Add MultiplyAddRotateComplexBySelectedScalar #105002

Merged
merged 3 commits on Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/coreclr/jit/codegenarm64test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6592,11 +6592,11 @@ void CodeGen::genArm64EmitterUnitTestsSve()
// IF_SVE_GV_3A
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V0, REG_V1, REG_V0, 0, 0,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V2, REG_V3, REG_V5, 1, 90,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V2, REG_V3, REG_V5, 1, 1,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V4, REG_V5, REG_V10, 0, 180,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V4, REG_V5, REG_V10, 0, 2,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V6, REG_V7, REG_V15, 1, 270,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V6, REG_V7, REG_V15, 1, 3,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>

// IF_SVE_GX_3A
Expand Down
13 changes: 6 additions & 7 deletions src/coreclr/jit/emitarm64sve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5825,14 +5825,13 @@ void emitter::emitInsSve_R_R_R_I_I(instruction ins,

case INS_sve_fcmla:
assert(opt == INS_OPTS_SCALABLE_S);
assert(isVectorRegister(reg1)); // ddddd
assert(isVectorRegister(reg2)); // nnnnn
assert(isLowVectorRegister(reg3)); // mmmm
assert(isValidUimm<1>(imm1)); // i
assert(isValidRot(imm2)); // rr
assert(isVectorRegister(reg1)); // ddddd
assert(isVectorRegister(reg2)); // nnnnn
assert(isLowVectorRegister(reg3)); // mmmm
assert(isValidUimm<1>(imm1)); // i
assert(emitIsValidEncodedRotationImm0_to_270(imm2)); // rr

// Convert imm2 from rotation value (0-270) to bitwise representation (0-3)
imm = (imm1 << 2) | emitEncodeRotationImm0_to_270(imm2);
imm = (imm1 << 2) | imm2;
fmt = IF_SVE_GV_3A;
break;

Expand Down
4 changes: 1 addition & 3 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,9 +963,7 @@ static void ValidateHWIntrinsicInfo(CORINFO_InstructionSet isa, NamedIntrinsic n
if (info.numArgs != -1)
{
// We should only have an expected number of arguments
#if defined(TARGET_ARM64)
assert((info.numArgs >= 0) && (info.numArgs <= 4));
#elif defined(TARGET_XARCH)
#if defined(TARGET_ARM64) || defined(TARGET_XARCH)
assert((info.numArgs >= 0) && (info.numArgs <= 5));
#else
unreached();
Expand Down
13 changes: 13 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,14 @@ struct HWIntrinsicInfo
break;
}

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
{
assert(sig->numArgs == 5);
*imm1Pos = 0;
*imm2Pos = 1;
break;
}

default:
{
assert(sig->numArgs > 0);
Expand All @@ -1105,6 +1113,7 @@ struct HWIntrinsic final
, op2(nullptr)
, op3(nullptr)
, op4(nullptr)
, op5(nullptr)
, numOperands(0)
, baseType(TYP_UNDEF)
{
Expand Down Expand Up @@ -1134,6 +1143,7 @@ struct HWIntrinsic final
GenTree* op2;
GenTree* op3;
GenTree* op4;
GenTree* op5;
size_t numOperands;
var_types baseType;

Expand All @@ -1144,6 +1154,9 @@ struct HWIntrinsic final

switch (numOperands)
{
case 5:
op5 = node->Op(5);
FALLTHROUGH;
case 4:
op4 = node->Op(4);
FALLTHROUGH;
Expand Down
66 changes: 66 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,27 @@ void HWIntrinsicInfo::lookupImmBounds(
immUpperBound = 3;
break;

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
// rotation comes after index in the intrinsic's signature,
// but flip the order here so we check the larger range first.
// This conforms to the existing logic in LinearScan::BuildHWIntrinsic
// when determining if we need an internal register for the jump table.
// This flipped ordering is reflected in HWIntrinsicInfo::GetImmOpsPositions.
if (immNumber == 1)
{
// Bounds for rotation
immLowerBound = 0;
immUpperBound = 3;
}
else
{
// Bounds for index
assert(immNumber == 2);
immLowerBound = 0;
immUpperBound = 1;
}
break;

case NI_Sve_TrigonometricMultiplyAddCoefficient:
immLowerBound = 0;
immUpperBound = 7;
Expand Down Expand Up @@ -3004,6 +3025,51 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
{
assert(sig->numArgs == 5);
assert(!isScalar);

CORINFO_ARG_LIST_HANDLE arg1 = sig->args;
CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1);
CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
CORINFO_ARG_LIST_HANDLE arg4 = info.compCompHnd->getArgNext(arg3);
CORINFO_ARG_LIST_HANDLE arg5 = info.compCompHnd->getArgNext(arg4);
var_types argType = TYP_UNKNOWN;
CORINFO_CLASS_HANDLE argClass = NO_CLASS_HANDLE;

int imm1LowerBound, imm1UpperBound; // Range for rotation
int imm2LowerBound, imm2UpperBound; // Range for index
HWIntrinsicInfo::lookupImmBounds(intrinsic, simdSize, simdBaseType, 1, &imm1LowerBound, &imm1UpperBound);
HWIntrinsicInfo::lookupImmBounds(intrinsic, simdSize, simdBaseType, 2, &imm2LowerBound, &imm2UpperBound);

argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg5, &argClass)));
GenTree* op5 = getArgForHWIntrinsic(argType, argClass);
assert(HWIntrinsicInfo::isImmOp(intrinsic, op5));
op5 = addRangeCheckIfNeeded(intrinsic, op5, mustExpand, imm1LowerBound, imm1UpperBound);

argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg4, &argClass)));
op4 = getArgForHWIntrinsic(argType, argClass);
assert(HWIntrinsicInfo::isImmOp(intrinsic, op4));
op4 = addRangeCheckIfNeeded(intrinsic, op4, mustExpand, imm2LowerBound, imm2UpperBound);

argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
op3 = getArgForHWIntrinsic(argType, argClass);
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
op2 = getArgForHWIntrinsic(argType, argClass);
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg1, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);

SetOpLclRelatedToSIMDIntrinsic(op1);
SetOpLclRelatedToSIMDIntrinsic(op2);
SetOpLclRelatedToSIMDIntrinsic(op3);
SetOpLclRelatedToSIMDIntrinsic(op4);
SetOpLclRelatedToSIMDIntrinsic(op5);
retNode = new (this, GT_HWINTRINSIC) GenTreeHWIntrinsic(retType, getAllocator(CMK_ASTNode), intrinsic,
simdBaseJitType, simdSize, op1, op2, op3, op4, op5);
break;
}

default:
{
return nullptr;
Expand Down
60 changes: 60 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,9 +265,15 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
regNumber op2Reg = REG_NA;
regNumber op3Reg = REG_NA;
regNumber op4Reg = REG_NA;
regNumber op5Reg = REG_NA;

switch (intrin.numOperands)
{
case 5:
assert(intrin.op5 != nullptr);
op5Reg = intrin.op5->GetRegNum();
FALLTHROUGH;

case 4:
assert(intrin.op4 != nullptr);
op4Reg = intrin.op4->GetRegNum();
Expand Down Expand Up @@ -2407,6 +2413,60 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
{
assert(isRMW);
assert(hasImmediateOperand);

if (targetReg != op1Reg)
{
assert(targetReg != op2Reg);
assert(targetReg != op3Reg);
GetEmitter()->emitInsSve_R_R(INS_sve_movprfx, EA_SCALABLE, targetReg, op1Reg);
}

// If both immediates are constant, we don't need a jump table
if (intrin.op4->IsCnsIntOrI() && intrin.op5->IsCnsIntOrI())
{
assert(intrin.op4->isContainedIntOrIImmed() && intrin.op5->isContainedIntOrIImmed());
GetEmitter()->emitInsSve_R_R_R_I_I(ins, emitSize, targetReg, op2Reg, op3Reg,
intrin.op4->AsIntCon()->gtIconVal,
intrin.op5->AsIntCon()->gtIconVal, opt);
}
else
{
// Use the helper to generate a table. The table can only use a single lookup value, therefore
// the two immediates index (0 to 1, in op4Reg) and rotation (0 to 3, in op5Reg) must be
// combined to a single value (0 to 7)
assert(!intrin.op4->isContainedIntOrIImmed() && !intrin.op5->isContainedIntOrIImmed());
emitAttr scalarSize = emitActualTypeSize(node->GetSimdBaseType());

// Combine the two immediates into op4Reg
// Shift rotation left to be out of range of index
GetEmitter()->emitIns_R_R_I(INS_lsl, scalarSize, op5Reg, op5Reg, 1);
// Combine the two values by ORing
GetEmitter()->emitIns_R_R_R(INS_orr, scalarSize, op4Reg, op4Reg, op5Reg);

// Generate the table using the combined immediate
HWIntrinsicImmOpHelper helper(this, op4Reg, 0, 7, node);
for (helper.EmitBegin(); !helper.Done(); helper.EmitCaseEnd())
{
// Extract index and rotation from the immediate
const int value = helper.ImmValue();
const ssize_t index = value & 1;
const ssize_t rotation = value >> 1;
GetEmitter()->emitInsSve_R_R_R_I_I(ins, emitSize, targetReg, op2Reg, op3Reg, index, rotation,
opt);
}

// Restore the original values in op4Reg and op5Reg
GetEmitter()->emitIns_R_R_I(INS_and, scalarSize, op4Reg, op4Reg, 1);
GetEmitter()->emitIns_R_R_I(INS_lsr, scalarSize, op5Reg, op5Reg, 1);
}

break;
}

default:
unreached();
}
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ HARDWARE_INTRINSIC(Sve, MinNumberAcross,
HARDWARE_INTRINSIC(Sve, Multiply, -1, 2, true, {INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MultiplyAdd, -1, -1, false, {INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, MultiplyAddRotateComplex, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcmla, INS_sve_fcmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand)
HARDWARE_INTRINSIC(Sve, MultiplyAddRotateComplexBySelectedScalar, -1, 5, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcmla, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_LowVectorOperation|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should have HW_Category_SIMDByIndexedElement.

Copy link
Member Author

@amanasifkhalid amanasifkhalid Jul 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that category makes sense, though this API seems different enough from other HW_Category_SIMDByIndexedElement APIs that we would have to add several special cases on that category's path (for example, all the 5-operand handling, plus the fact that we index the third vector register in pairs of elements, so the default bounds calculation for the index doesn't work). Do we want to move this logic over?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Synced offline — it is OK to skip the HW_Category_SIMDByIndexedElement category here, but please add a comment in lsraarm64.cpp explaining why this intrinsic is not marked with that category and why only one internal register is needed even though it has two immediate operands.

HARDWARE_INTRINSIC(Sve, MultiplyBySelectedScalar, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_LowVectorOperation)
HARDWARE_INTRINSIC(Sve, MultiplyExtended, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmulx, INS_sve_fmulx}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MultiplySubtract, -1, -1, false, {INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
Expand Down
12 changes: 12 additions & 0 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3698,6 +3698,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op4));
assert(varTypeIsIntegral(intrin.op5));
// Can only avoid generating a table if both immediates are constant.
if (intrin.op4->IsCnsIntOrI() && intrin.op5->IsCnsIntOrI())
{
MakeSrcContained(node, intrin.op4);
MakeSrcContained(node, intrin.op5);
}
break;

default:
unreached();
}
Expand Down
Loading
Loading