
Lowering Vector512() methods: comparison + shift #84942

Merged (4 commits, Apr 21, 2023)

Changes from all commits:
212 changes: 200 additions & 12 deletions src/coreclr/jit/gentree.cpp
@@ -19677,6 +19677,45 @@ GenTree* Compiler::gtNewSimdBinOpNode(
intrinsic = NI_AVX2_ShiftRightLogical;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());

if (op == GT_LSH)
{
if (varTypeIsShort(simdBaseType))
{
intrinsic = NI_AVX512BW_ShiftLeftLogical;
}
else
{
intrinsic = NI_AVX512F_ShiftLeftLogical;
}
}
else if (op == GT_RSH)
{
if (varTypeIsShort(simdBaseType))
{
intrinsic = NI_AVX512BW_ShiftRightArithmetic;
}
else
{
intrinsic = NI_AVX512F_ShiftRightArithmetic;
}
}
else
{
assert(op == GT_RSZ);
if (varTypeIsShort(simdBaseType))
{
intrinsic = NI_AVX512BW_ShiftRightLogical;
}
else
{
intrinsic = NI_AVX512F_ShiftRightLogical;
}
}
}
else if (op == GT_LSH)
{
intrinsic = NI_SSE2_ShiftLeftLogical;
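In the 64-byte path above, the element width picks the ISA: AVX-512F only provides 32/64-bit element shifts, so the 16-bit (short) element forms route to AVX-512BW. A standalone sketch of the selection, with string names standing in for the JIT's NamedIntrinsic IDs (illustrative only, not JIT code):

#include <cstdio>

// Mirror the Vector512 shift-intrinsic selection above: AVX-512F covers
// 32/64-bit element shifts; short (16-bit) elements need AVX-512BW.
enum class ShiftOp { LSH, RSH, RSZ }; // left, arithmetic right, logical right

static const char* Select512Shift(ShiftOp op, bool shortElement)
{
    switch (op)
    {
        case ShiftOp::LSH:
            return shortElement ? "NI_AVX512BW_ShiftLeftLogical" : "NI_AVX512F_ShiftLeftLogical";
        case ShiftOp::RSH:
            return shortElement ? "NI_AVX512BW_ShiftRightArithmetic" : "NI_AVX512F_ShiftRightArithmetic";
        default: // RSZ
            return shortElement ? "NI_AVX512BW_ShiftRightLogical" : "NI_AVX512F_ShiftRightLogical";
    }
}

int main()
{
    std::printf("%s\n", Select512Shift(ShiftOp::RSH, true)); // NI_AVX512BW_ShiftRightArithmetic
}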
@@ -20350,11 +20389,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(

NamedIntrinsic intrinsic = NI_Illegal;

if (simdSize == 64)
{
assert(op == GT_EQ);
}

switch (op)
{
#if defined(TARGET_XARCH)
Expand Down Expand Up @@ -20429,6 +20463,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX_CompareGreaterThanOrEqual;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareGreaterThanOrEqualSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareGreaterThanOrEqual;
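Unlike the SSE/AVX comparisons in this chain, which produce their result directly as a vector of per-lane masks, the 64-byte *Special intrinsics exist because the EVEX vpcmp instructions write a k-mask register with one bit per lane (see the isMaskReg asserts in the codegen changes below); a later node, NI_AVX512F_MoveMaskToVectorSpecial in this PR, expands the mask back into per-lane all-ones/all-zeros. A scalar emulation of that two-step shape, assuming 16 int lanes (illustrative only, not JIT code):

#include <cstdint>

// Step 1: emulate the masked compare (vpcmpd with the >= predicate):
// one mask bit per lane.
static uint16_t EmulateCompareGE512(const int32_t a[16], const int32_t b[16])
{
    uint16_t mask = 0;
    for (int i = 0; i < 16; i++) // Vector512<int> has 16 lanes
    {
        if (a[i] >= b[i])
        {
            mask |= (uint16_t)(1u << i);
        }
    }
    return mask;
}

// Step 2: emulate the mask-to-vector expansion (what MoveMaskToVectorSpecial
// produces): each mask bit becomes an all-ones or all-zeros lane.
static void EmulateMaskToVector(uint16_t mask, int32_t out[16])
{
    for (int i = 0; i < 16; i++)
    {
        out[i] = (mask & (1u << i)) ? -1 : 0;
    }
}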
Expand Down Expand Up @@ -20575,6 +20614,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX2_CompareGreaterThan;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareGreaterThanSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareGreaterThan;
Expand Down Expand Up @@ -20655,6 +20699,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX_CompareLessThanOrEqual;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareLessThanOrEqualSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareLessThanOrEqual;
Expand Down Expand Up @@ -20801,6 +20850,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX2_CompareLessThan;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareLessThanSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareLessThan;
Expand Down Expand Up @@ -20989,18 +21043,18 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
#if defined(TARGET_XARCH)
case GT_EQ:
{
if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Equality;
}
else if (simdSize == 32)
if (simdSize == 32)
Member: Just curious, why did we decide to check for 32, then 64, and then (implicitly) 16 or less? Is it because Vector256 is more common and should hit that condition first, from a throughput perspective?

Contributor Author: Yep. Initially, when we had throughput issues, this was one of the things we tried: making the check for 32 the first one, since it is the most common case.
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));

intrinsic = NI_Vector256_op_Equality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Equality;
}
else
{
intrinsic = NI_Vector128_op_Equality;
@@ -21009,8 +21063,125 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
}

case GT_GE:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet

if (simdSize == 32)
{
// TODO-XArch-CQ: It's a non-trivial amount of work to support these
// for floating-point while only utilizing AVX. It would require, among
// other things, inverting the comparison and potentially support for a
// new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Equality;
Member, commenting on lines +21072 to +21077: For other reviewers, this is just a copy/paste of an existing comment that used to be shared across GE/GT/LE/LT. It's actually a lot easier for us to handle this today if we wanted to, and it is maybe a small/easy win we can do for .NET 8 (in a separate PR, of course), so having the logic duplicated now will make doing that a bit simpler.
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_GreaterThanOrEqualAll;
break;
}
else
{
intrinsic = NI_Vector128_op_Equality;
}

op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
op2 = gtNewAllBitsSetConNode(simdType);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}
break;
}
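For 128-bit and 256-bit vectors, the All reduction above is assembled from two pieces: run the per-lane comparison, then test the resulting vector mask for equality with AllBitsSet (with float/double lanes reinterpreted as int/long so the equality is a pure bit test). The 64-byte case instead breaks out early with a dedicated Vector512 *All intrinsic. A scalar sketch of the 128/256-bit pattern (illustrative only, not JIT code):

#include <cstdint>

// "Compare, then == AllBitsSet": every lane must produce an all-ones mask.
static bool AllGreaterThanOrEqual(const int32_t* a, const int32_t* b, int laneCount)
{
    for (int i = 0; i < laneCount; i++)
    {
        int32_t laneMask = (a[i] >= b[i]) ? -1 : 0; // per-lane comparison result
        if (laneMask != -1)
        {
            return false; // equality with AllBitsSet fails
        }
    }
    return true; // matches GreaterThanOrEqualAll
}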
case GT_GT:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet

if (simdSize == 32)
{
// TODO-XArch-CQ: It's a non-trivial amount of work to support these
// for floating-point while only utilizing AVX. It would require, among
// other things, inverting the comparison and potentially support for a
// new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Equality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_GreaterThanAll;
break;
}
else
{
intrinsic = NI_Vector128_op_Equality;
}

op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
op2 = gtNewAllBitsSetConNode(simdType);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}
break;
}
case GT_LE:
{
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet

if (simdSize == 32)
{
// TODO-XArch-CQ: It's a non-trivial amount of work to support these
// for floating-point while only utilizing AVX. It would require, among
// other things, inverting the comparison and potentially support for a
// new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Equality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_LessThanOrEqualAll;
break;
}
else
{
intrinsic = NI_Vector128_op_Equality;
}

op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
op2 = gtNewAllBitsSetConNode(simdType);

if (simdBaseType == TYP_FLOAT)
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;
}
else if (simdBaseType == TYP_DOUBLE)
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;
}
break;
}
case GT_LT:
{
// We want to generate a comparison along the lines of
Expand All @@ -21025,6 +21196,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
intrinsic = NI_Vector256_op_Equality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_LessThanAll;
break;
}
else
{
intrinsic = NI_Vector128_op_Equality;
@@ -21257,7 +21434,18 @@ GenTree* Compiler::gtNewSimdCndSelNode(

#if defined(TARGET_XARCH)
assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX));
intrinsic = (simdSize == 32) ? NI_Vector256_ConditionalSelect : NI_Vector128_ConditionalSelect;
if (simdSize == 32)
{
intrinsic = NI_Vector256_ConditionalSelect;
}
else if (simdSize == 64)
{
intrinsic = NI_Vector512_ConditionalSelect;
}
else
{
intrinsic = NI_Vector128_ConditionalSelect;
}
return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, intrinsic, simdBaseJitType, simdSize);
#elif defined(TARGET_ARM64)
return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, NI_AdvSimd_BitwiseSelect, simdBaseJitType, simdSize);
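The xarch change above only widens the size-based dispatch to cover Vector512; the semantics of ConditionalSelect are unchanged: each result bit is taken from op2 where the corresponding mask bit in op1 is set, and from op3 where it is clear. A scalar sketch of that per-element rule (illustrative only, not JIT code):

#include <cstdint>

// result = (mask & whenTrue) | (~mask & whenFalse), applied bitwise per element.
static int32_t ConditionalSelectScalar(int32_t mask, int32_t whenTrue, int32_t whenFalse)
{
    return (mask & whenTrue) | (~mask & whenFalse);
}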
58 changes: 58 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1760,6 +1760,64 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_AVX512F_CompareGreaterThanOrEqualSpecial:
{
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanOrEqualSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 5);
break;
}

case NI_AVX512F_CompareGreaterThanSpecial:
{
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 6);
break;
}
case NI_AVX512F_CompareLessThanOrEqualSpecial:
{
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanOrEqualSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 2);
break;
}

case NI_AVX512F_CompareLessThanSpecial:
{
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 1);
break;
}
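The trailing immediate in each emitIns_R_R_R_I call above selects the EVEX vpcmp comparison predicate, following the standard _MM_CMPINT_* encoding (0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT i.e. >=, 6 = NLE i.e. >). A hypothetical helper making the mapping explicit; the JIT passes these constants inline instead:

#include <cassert>

enum class SimdCmp { GE, GT, LE, LT };

// Map each Special compare to its vpcmp predicate immediate.
static int VpcmpPredicate(SimdCmp op)
{
    switch (op)
    {
        case SimdCmp::GE: return 5; // NLT: CompareGreaterThanOrEqualSpecial
        case SimdCmp::GT: return 6; // NLE: CompareGreaterThanSpecial
        case SimdCmp::LE: return 2; // LE:  CompareLessThanOrEqualSpecial
        case SimdCmp::LT: return 1; // LT:  CompareLessThanSpecial
    }
    assert(false && "unreachable");
    return -1;
}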
case NI_AVX512F_MoveMaskToVectorSpecial:
{
op1Reg = op1->GetRegNum();