Lower AVX10v1 hwintrinsic in lowering and gentree.cpp for simdSize 32/16
khushal1996 committed May 11, 2024
1 parent ab2c78e commit 538e0bf
Showing 11 changed files with 928 additions and 226 deletions.
3 changes: 2 additions & 1 deletion src/coreclr/jit/compiler.h
@@ -9502,7 +9502,8 @@ class Compiler
//
bool canUseEvexEncoding() const
{
return compOpportunisticallyDependsOn(InstructionSet_AVX512F);
return (compOpportunisticallyDependsOn(InstructionSet_AVX512F) ||
compOpportunisticallyDependsOn(InstructionSet_AVX10v1));
}

private:
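
The canUseEvexEncoding() change above is the gate for everything else in this commit: EVEX-encoded instructions become usable when either AVX-512F or AVX10.1 is present, since AVX10.1 guarantees EVEX support at 128/256-bit widths. Below is a minimal standalone sketch of the opportunistic-check pattern the method relies on; IsaSupport and the enum values are simplified stand-ins for the JIT's ISA bookkeeping, not the coreclr API.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins; the real JIT tracks ISAs in CORINFO_InstructionSetFlags.
enum InstructionSet : uint32_t
{
    InstructionSet_AVX512F = 1u << 0,
    InstructionSet_AVX10v1 = 1u << 1,
};

struct IsaSupport
{
    uint32_t supported = 0; // what the host CPU / JIT config allows
    uint32_t recorded  = 0; // dependencies the compiled method has taken

    // Mirrors compOpportunisticallyDependsOn: answers "may I use this ISA?"
    // and, when the answer is yes, records the dependency for the method.
    bool opportunisticallyDependsOn(InstructionSet isa)
    {
        bool usable = (supported & isa) != 0;
        if (usable)
        {
            recorded |= isa;
        }
        return usable;
    }

    // After this commit: EVEX is usable on AVX10.1-only hardware too.
    bool canUseEvexEncoding()
    {
        return opportunisticallyDependsOn(InstructionSet_AVX512F) ||
               opportunisticallyDependsOn(InstructionSet_AVX10v1);
    }
};

int main()
{
    IsaSupport avx10Only;
    avx10Only.supported = InstructionSet_AVX10v1;
    std::printf("EVEX on an AVX10.1-only core: %s\n",
                avx10Only.canUseEvexEncoding() ? "yes" : "no"); // yes
    return 0;
}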
565 changes: 462 additions & 103 deletions src/coreclr/jit/gentree.cpp

Large diffs are not rendered by default.
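
The gentree.cpp diff (the bulk of this commit) is not rendered here, but per the commit title it teaches gtNewSimdCvtNode and related helpers to emit the AVX10.1 conversion forms at 16/32-byte sizes. For reference, a scalar model of the saturating contract those conversions expose, as I understand .NET's platform-independent behavior (NaN maps to zero, out-of-range values clamp); this is an illustration of the semantics, not code from gentree.cpp.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int64_t convertToInt64Saturating(double d)
{
    if (std::isnan(d))
        return 0;                                      // NaN maps to zero
    if (d <= static_cast<double>(std::numeric_limits<int64_t>::min()))
        return std::numeric_limits<int64_t>::min();    // clamp low
    if (d >= static_cast<double>(std::numeric_limits<int64_t>::max()))
        return std::numeric_limits<int64_t>::max();    // clamp high
    return static_cast<int64_t>(d);                    // in range: truncate
}

int main()
{
    std::printf("%lld\n", (long long)convertToInt64Saturating(1e30));   // INT64_MAX
    std::printf("%lld\n", (long long)convertToInt64Saturating(-1e30));  // INT64_MIN
    std::printf("%lld\n", (long long)convertToInt64Saturating(NAN));    // 0
    return 0;
}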

5 changes: 5 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1447,6 +1447,8 @@ void CodeGen::genNonTableDrivenHWIntrinsicsJumpTableFallback(GenTreeHWIntrinsic*

case NI_AVX512F_ConvertToInt32:
case NI_AVX512F_ConvertToUInt32:
case NI_AVX10v1_ConvertToInt32:
case NI_AVX10v1_ConvertToUInt32:
#if defined(TARGET_AMD64)
case NI_AVX512F_X64_ConvertToInt64:
case NI_AVX512F_X64_ConvertToUInt64:
@@ -2833,6 +2835,9 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
case NI_AVX512F_X64_ConvertToInt64:
case NI_AVX512F_X64_ConvertToUInt64:
case NI_AVX512F_X64_ConvertToUInt64WithTruncation:
case NI_AVX10v1_ConvertToInt32:
case NI_AVX10v1_ConvertToUInt32:
case NI_AVX10v1_ConvertToUInt32WithTruncation:
{
assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
emitAttr attr = emitTypeSize(targetType);
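
The jump-table fallback extended above exists for convert intrinsics whose rounding-mode operand is not a JIT-time constant: codegen branches to the instruction form carrying the matching embedded-rounding bits. As a loose analogy only (fesetround changes a thread-global mode, while embedded rounding is per-instruction), the four selectable IEEE directions can be demonstrated in standalone C++:

#include <cfenv>
#include <cmath>
#include <cstdio>

#pragma STDC FENV_ACCESS ON

int main()
{
    const int modes[] = {FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO};
    const char* names[] = {"to-nearest", "down", "up", "toward-zero"};
    for (int i = 0; i < 4; i++)
    {
        std::fesetround(modes[i]);
        // lrintf honors the current rounding mode, much like a scalar convert
        // with the corresponding embedded-rounding (RC) bits selected.
        std::printf("%-12s: 2.5f -> %ld, -2.5f -> %ld\n", names[i],
                    std::lrintf(2.5f), std::lrintf(-2.5f));
    }
    std::fesetround(FE_TONEAREST);
    return 0;
}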
170 changes: 85 additions & 85 deletions src/coreclr/jit/hwintrinsiclistxarch.h

Large diffs are not rendered by default.

49 changes: 42 additions & 7 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1458,7 +1458,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 1);
assert(varTypeIsLong(simdBaseType));
if (IsBaselineVector512IsaSupportedOpportunistically())
if ((simdSize != 64) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
if (simdSize == 32)
{
intrinsic = NI_AVX10v1_ConvertToVector256Double;
}
else
{
assert(simdSize == 16);
intrinsic = NI_AVX10v1_ConvertToVector128Double;
}
op1 = impSIMDPopStack();
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
}
else if (IsBaselineVector512IsaSupportedOpportunistically())
{
if (simdSize == 64)
{
@@ -1513,7 +1527,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_DOUBLE);

if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize);
@@ -1528,7 +1543,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_DOUBLE);

if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize);
@@ -1560,6 +1576,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
unreached();
}
}
else if (simdBaseType == TYP_UINT && simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
switch (simdSize)
{
case 16:
intrinsic = NI_AVX10v1_ConvertToVector128Single;
break;
case 32:
intrinsic = NI_AVX10v1_ConvertToVector256Single;
break;
default:
unreached();
}
}
else if (simdBaseType == TYP_UINT && IsBaselineVector512IsaSupportedOpportunistically())
{
switch (simdSize)
@@ -1592,7 +1622,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_FLOAT);

if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize);
@@ -1607,7 +1638,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_FLOAT);

if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize);
@@ -1621,7 +1653,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_DOUBLE);
if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize);
@@ -1636,7 +1669,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_DOUBLE);

if (IsBaselineVector512IsaSupportedOpportunistically())
if (IsBaselineVector512IsaSupportedOpportunistically() ||
(simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize);
@@ -3630,6 +3664,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
case NI_AVX512F_FixupScalar:
case NI_AVX512F_VL_Fixup:
case NI_AVX10v1_Fixup:
case NI_AVX10v1_FixupScalar:
{
assert(sig->numArgs == 4);

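
Every importer change in this file follows the same ladder: for 128/256-bit vectors, prefer the AVX10.1 intrinsic when the ISA is available, otherwise fall back to the AVX-512 forms, and keep requiring the AVX-512 baseline for 512-bit vectors. A condensed standalone sketch of that policy for the long-to-double conversion follows; the enum names are simplified stand-ins for the JIT's.

#include <cassert>
#include <cstdio>

enum class Intrinsic
{
    Illegal,
    AVX10v1_ConvertToVector128Double,
    AVX10v1_ConvertToVector256Double,
    AVX512DQ_VL_ConvertToVector128Double,
    AVX512DQ_VL_ConvertToVector256Double,
    AVX512DQ_ConvertToVector512Double,
};

Intrinsic pickConvertDouble(unsigned simdSize, bool hasAvx10v1, bool hasAvx512)
{
    if (simdSize != 64 && hasAvx10v1) // AVX10.1 covers the 128/256-bit forms
    {
        return simdSize == 32 ? Intrinsic::AVX10v1_ConvertToVector256Double
                              : Intrinsic::AVX10v1_ConvertToVector128Double;
    }
    if (hasAvx512)                    // otherwise fall back to AVX-512 (VL for small sizes)
    {
        if (simdSize == 64)
            return Intrinsic::AVX512DQ_ConvertToVector512Double;
        return simdSize == 32 ? Intrinsic::AVX512DQ_VL_ConvertToVector256Double
                              : Intrinsic::AVX512DQ_VL_ConvertToVector128Double;
    }
    return Intrinsic::Illegal;        // importer leaves the call as a regular call
}

int main()
{
    assert(pickConvertDouble(32, true, false) == Intrinsic::AVX10v1_ConvertToVector256Double);
    assert(pickConvertDouble(64, true, true)  == Intrinsic::AVX512DQ_ConvertToVector512Double);
    std::puts("ok");
    return 0;
}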
177 changes: 175 additions & 2 deletions src/coreclr/jit/importercalls.cpp
@@ -5225,6 +5225,10 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic,
{
hwIntrinsicId = NI_SSE_ConvertToInt32WithTruncation;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
hwIntrinsicId = NI_AVX10v1_ConvertToUInt32WithTruncation;
}
else if (IsBaselineVector512IsaSupportedOpportunistically())
{
hwIntrinsicId = NI_AVX512F_ConvertToUInt32WithTruncation;
@@ -5238,6 +5242,10 @@
{
hwIntrinsicId = NI_SSE2_ConvertToInt32WithTruncation;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
hwIntrinsicId = NI_AVX10v1_ConvertToUInt32WithTruncation;
}
else if (IsBaselineVector512IsaSupportedOpportunistically())
{
hwIntrinsicId = NI_AVX512F_ConvertToUInt32WithTruncation;
@@ -8784,7 +8792,12 @@ GenTree* Compiler::impEstimateIntrinsic(CORINFO_METHOD_HANDLE method,
case NI_System_Math_ReciprocalEstimate:
{
#if defined(TARGET_XARCH)
if (compExactlyDependsOn(InstructionSet_AVX512F))
if (compExactlyDependsOn(InstructionSet_AVX10v1))
{
simdType = TYP_SIMD16;
intrinsicId = NI_AVX10v1_Reciprocal14Scalar;
}
else if (compExactlyDependsOn(InstructionSet_AVX512F))
{
simdType = TYP_SIMD16;
intrinsicId = NI_AVX512F_Reciprocal14Scalar;
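
Reciprocal14Scalar, like the AVX-512 form it mirrors, guarantees a relative error of less than 2^-14, which is why it can back Math.ReciprocalEstimate directly. Callers wanting full precision typically add Newton-Raphson steps, each of which roughly squares the error; a standalone illustration with a simulated estimate:

#include <cmath>
#include <cstdio>

int main()
{
    double x   = 3.0;
    double est = 1.0 / x * (1.0 + 6e-5);    // simulate an estimate with ~2^-14 error
    double ref = est * (2.0 - x * est);      // one Newton-Raphson iteration
    std::printf("est err = %.3g, refined err = %.3g\n",
                std::fabs(est * x - 1.0),    // ~6e-5
                std::fabs(ref * x - 1.0));   // ~3.6e-9, i.e. the error squared
    return 0;
}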
@@ -9234,7 +9247,167 @@ GenTree* Compiler::impMinMaxIntrinsic(CORINFO_METHOD_HANDLE method,
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
// We are constructing a chain of intrinsics similar to:
// var op1 = Vector128.CreateScalarUnsafe(x);
// var op2 = Vector128.CreateScalarUnsafe(y);
//
// var tmp = Avx10v1.RangeScalar(op1, op2, imm8);
// var tbl = Vector128.CreateScalarUnsafe(0x00);
//
// tmp = Avx10v1.FixupScalar(tmp, op2, tbl, 0x00);
// tmp = Avx10v1.FixupScalar(tmp, op1, tbl, 0x00);
//
// return tmp.ToScalar();

// RangeScalar operates by default almost like MaxNumber or MinNumber,
// but it propagates sNaN and does not propagate qNaN, so we need an
// additional fixup to ensure we propagate qNaN as well.

uint8_t imm8;

if (isMax)
{
if (isMagnitude)
{
// 0b01_11: Sign(CompareResult), Max-Abs Value
imm8 = 0x07;
}
else
{
// 0b01_01: Sign(CompareResult), Max Value
imm8 = 0x05;
}
}
else if (isMagnitude)
{
// 0b01_10: Sign(CompareResult), Min-Abs Value
imm8 = 0x06;
}
else
{
// 0b01_00: Sign(CompareResult), Min Value
imm8 = 0x04;
}

GenTree* op3 = gtNewIconNode(imm8);
GenTree* op2 = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, impPopStack().val, callJitType, 16);
GenTree* op1 = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, impPopStack().val, callJitType, 16);

GenTree* op2Clone;
op2 = impCloneExpr(op2, &op2Clone, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning op2 for Math.Max/Min"));

GenTree* op1Clone;
op1 = impCloneExpr(op1, &op1Clone, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning op1 for Math.Max/Min"));

GenTree* tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, NI_AVX10v1_RangeScalar, callJitType, 16);

// FixupScalar(left, right, table, control) classifies right, adjusts the
// result based on the table entry for that classification, and then returns.
//
// In our case, left is going to be the result of the RangeScalar operation,
// which is either sNaN or a normal value, and right is going to be op1 or op2.

GenTree* tbl1 = gtNewVconNode(TYP_SIMD16);
GenTree* tbl2;

// We currently have (commutative)
// * snan, snan = snan
// * snan, qnan = snan
// * snan, norm = snan
// * qnan, qnan = qnan
// * qnan, norm = norm
// * norm, norm = norm

if (isNumber)
{
// We need to fixup the case of:
// * snan, norm = snan
//
// Instead, it should be:
// * snan, norm = norm

// First look at op1 and op2 using op2 as the classification
//
// If op2 is norm, we take op2 (norm)
// If op2 is nan, we take op1 ( nan or norm)
//
// Thus, if one input was norm the fixup is now norm

// QNAN: 0b0000: Preserve left
// SNAN: 0b0000
// ZERO: 0b0001: Preserve right
// +ONE: 0b0001
// -INF: 0b0001
// +INF: 0b0001
// -VAL: 0b0001
// +VAL: 0b0001
tbl1->AsVecCon()->gtSimdVal.i32[0] = 0x11111100;

// Next look at result and fixup using result as the classification
//
// If result is norm, we take the result (norm)
// If result is nan, we take the fixup ( nan or norm)
//
// Thus if either input was snan, we now have norm as expected
// Otherwise, the result was already correct

tbl1 = impCloneExpr(tbl1, &tbl2, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning tbl for Math.Max/Min"));

op1Clone = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, op2Clone, tbl1, gtNewIconNode(0),
NI_AVX10v1_FixupScalar, callJitType, 16);

tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, tmp, tbl2, gtNewIconNode(0), NI_AVX10v1_FixupScalar,
callJitType, 16);
}
else
{
// We need to fixup the case of:
// * qnan, norm = norm
//
// Instead, it should be:
// * qnan, norm = qnan

// First look at op1 and op2 using op2 as the classification
//
// If op2 is norm, we take op1 ( nan or norm)
// If op2 is snan, we take op1 ( nan or norm)
// If op2 is qnan, we take op2 (qnan)
//
// Thus, if either input was qnan the fixup is now qnan

// QNAN: 0b0001: Preserve right
// SNAN: 0b0000: Preserve left
// ZERO: 0b0000
// +ONE: 0b0000
// -INF: 0b0000
// +INF: 0b0000
// -VAL: 0b0000
// +VAL: 0b0000
tbl1->AsVecCon()->gtSimdVal.i32[0] = 0x00000001;

// Next look at result and fixup using fixup as the classification
//
// If fixup is norm, we take the result (norm)
// If fixup is sNaN, we take the result (sNaN)
// If fixup is qNaN, we take the fixup (qNaN)
//
// Thus if the fixup was qnan, we now have qnan as expected
// Otherwise, the result was already correct

tbl1 = impCloneExpr(tbl1, &tbl2, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning tbl for Math.Max/Min"));

op1Clone = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, op2Clone, tbl1, gtNewIconNode(0),
NI_AVX10v1_FixupScalar, callJitType, 16);

tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp, op1Clone, tbl2, gtNewIconNode(0), NI_AVX10v1_FixupScalar,
callJitType, 16);
}

return gtNewSimdToScalarNode(callType, tmp, callJitType, 16);
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ))
{
// We are constructing a chain of intrinsics similar to:
// var op1 = Vector128.CreateScalarUnsafe(x);
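
The two FixupScalar tables above are the heart of the NaN handling: the right operand is classified into one of eight tokens, the token selects a 4-bit entry, and the only entry values these tables use are 0 (preserve left) and 1 (preserve right), exactly as the inline comments describe. Below is a standalone model of that much, replaying the MaxNumber chain from the diff on (sNaN, 1.0). The token numbering (QNAN=0 through +VAL=7) follows the comment order above, and rangeMax is a deliberately crude stand-in for RangeScalar with imm8 = 0x05.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static bool isQuietNan(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return std::isnan(f) && ((bits >> 22) & 1); // quiet bit = mantissa MSB
}

// Token order from the comments above: QNAN=0, SNAN=1, ZERO=2, +ONE=3,
// -INF=4, +INF=5, -VAL=6, +VAL=7.
static int classify(float f)
{
    if (std::isnan(f)) return isQuietNan(f) ? 0 : 1;
    if (f == 0.0f)     return 2;
    if (f == 1.0f)     return 3;
    if (std::isinf(f)) return f < 0 ? 4 : 5;
    return f < 0 ? 6 : 7;
}

static float fixupScalar(float left, float right, uint32_t table)
{
    uint32_t entry = (table >> (classify(right) * 4)) & 0xF;
    return (entry == 0) ? left : right; // 0: preserve left, 1: preserve right
}

// Crude model of RangeScalar max (imm8 = 0x05): propagates sNaN, drops qNaN.
static float rangeMax(float a, float b)
{
    if (std::isnan(a) && !isQuietNan(a)) return a;
    if (std::isnan(b) && !isQuietNan(b)) return b;
    if (std::isnan(a)) return b;
    if (std::isnan(b)) return a;
    return a >= b ? a : b;
}

int main()
{
    float sNaN;
    uint32_t s = 0x7F800001u;
    std::memcpy(&sNaN, &s, sizeof(s));
    float op1 = sNaN, op2 = 1.0f;

    // The isNumber (MaxNumber) chain from the diff, with tbl = 0x11111100:
    float tmp   = rangeMax(op1, op2);                  // sNaN propagated
    float fixup = fixupScalar(op1, op2, 0x11111100);   // op2 is +ONE -> keep right (1.0)
    float res   = fixupScalar(fixup, tmp, 0x11111100); // tmp is sNaN -> keep left (1.0)
    std::printf("MaxNumber(sNaN, 1.0) -> %f\n", res);  // 1.000000, as required
    return 0;
}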
4 changes: 2 additions & 2 deletions src/coreclr/jit/importervectorization.cpp
@@ -161,7 +161,7 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize);

#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) || compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
GenTree* control;

@@ -185,7 +185,7 @@
// ((v1 ^ cns1) | (v2 ^ cns2)) == zero

#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) || compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
GenTree* control;

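
On the AVX-512 VL / AVX10.1 path, the ((v1 ^ cns1) | (v2 ^ cns2)) == zero shape can fold into a single EVEX ternary-logic instruction; the control byte built above is not shown in this excerpt. The standard derivation evaluates the boolean function on the canonical truth-table masks A=0xF0, B=0xCC, C=0xAA, and the 8-bit result is the immediate. The sketch below uses f(a,b,c) = (a ^ b) | c as a hypothetical stand-in expression, not the control the JIT actually constructs here.

#include <cstdint>
#include <cstdio>

// Evaluate the expression on the truth-table masks to get the imm8.
static uint8_t ternlogControl(uint8_t (*f)(uint8_t, uint8_t, uint8_t))
{
    return f(0xF0, 0xCC, 0xAA);
}

static uint8_t xorThenOr(uint8_t a, uint8_t b, uint8_t c)
{
    return static_cast<uint8_t>((a ^ b) | c);
}

int main()
{
    // (0xF0 ^ 0xCC) | 0xAA = 0x3C | 0xAA = 0xBE
    std::printf("imm8 for (a ^ b) | c = 0x%02X\n", ternlogControl(xorThenOr));
    return 0;
}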