From 3a05e3ba72177d1c86dce732cac4d62171e5ecfa Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sat, 6 Jul 2024 23:23:40 -0700 Subject: [PATCH 01/10] Decompose some bitwise operations in HIR to allow more overall optimizations to kick in --- src/coreclr/jit/gentree.cpp | 93 ++++++-------- src/coreclr/jit/hwintrinsicarm64.cpp | 34 ++++++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 40 ++++++ src/coreclr/jit/hwintrinsiclistarm64.h | 4 +- src/coreclr/jit/hwintrinsiclistxarch.h | 19 +-- src/coreclr/jit/hwintrinsicxarch.cpp | 43 ++++++- src/coreclr/jit/importervectorization.cpp | 34 +----- src/coreclr/jit/lowerarmarch.cpp | 70 +++++++++++ src/coreclr/jit/lowerxarch.cpp | 129 ++++++++++++++++++++ src/coreclr/jit/morph.cpp | 94 +------------- src/coreclr/jit/valuenum.cpp | 57 ++------- 11 files changed, 380 insertions(+), 237 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 4093f3b796bdfe..fa1bd73aa3dea0 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20861,12 +20861,6 @@ GenTree* Compiler::gtNewSimdBinOpNode( } } } - - if (op == GT_AND_NOT) - { - // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2` - needsReverseOps = true; - } break; } #endif // TARGET_XARCH @@ -20897,11 +20891,34 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (intrinsic != NI_Illegal) { + if (op == GT_AND_NOT) + { + assert(fgNodeThreading == NodeThreading::LIR); + +#if defined(TARGET_XARCH) + // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2` + // We specially handle this here since we're only producing a + // native intrinsic node in LIR + + std::swap(op1, op2); +#endif // TARGET_XARCH + } return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); } switch (op) { + case GT_AND_NOT: + { + // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can + // appropriately optimize around the individual operations being performed, particularly ~op2, + // and produce overall better codegen. 
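                // [Editorial sketch, not part of the patch] The equivalence the decomposition relies on,
                // written out as plain scalar C++ so the intent is explicit:
                //
                //     // what the former GT_AND_NOT node computed
                //     uint64_t AndNot(uint64_t op1, uint64_t op2) { return op1 & ~op2; }
                //
                //     // what the decomposed HIR computes: a GT_NOT node feeding a GT_AND node
                //     uint64_t Decomposed(uint64_t op1, uint64_t op2) { uint64_t notOp2 = ~op2; return op1 & notOp2; }
                //
                // Both return the same value for all inputs; the benefit is purely that phases running before
                // LIR now see the NOT and the AND as separate, independently foldable nodes.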
+ assert(fgNodeThreading != NodeThreading::LIR); + + op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize); + return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize); + } + #if defined(TARGET_XARCH) case GT_RSZ: { @@ -21066,9 +21083,6 @@ GenTree* Compiler::gtNewSimdBinOpNode( vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF; } - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - // Vector256 maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16() GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1, widenedSimdBaseJitType, widenedSimdSize); @@ -22033,9 +22047,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode( v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle, CORINFO_TYPE_INT, simdSize); - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize); return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize); } @@ -24146,9 +24157,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( GenTree* vecCon2 = gtCloneExpr(vecCon1); - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, @@ -24187,9 +24195,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( GenTree* vecCon2 = gtCloneExpr(vecCon1); - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT, @@ -24291,9 +24296,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( GenTree* vecCon2 = gtCloneExpr(vecCon1); - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); @@ -24330,9 +24332,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( GenTree* vecCon2 = gtCloneExpr(vecCon1); - // Validate we can't use AVX512F_VL_TernaryLogic here - assert(!canUseEvexEncodingDebugOnly()); - tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); @@ -27821,6 +27820,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(!isScalar); assert(op2->TypeIs(simdType)); + if (comp->fgNodeThreading != NodeThreading::LIR) + { + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead. 
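                // [Editorial sketch, not part of the patch] The shapes lowering is expected to re-form into a
                // single AndNot intrinsic, shown over scalar values:
                //
                //     a & ~b   ==>  AndNot(a, b)     // ARM64 BIC computes op1 & ~op2
                //     ~a & b   ==>  AndNot(b, a)     // operands swapped so the NOT lands on op2
                //
                // On xarch the underlying PANDN/ANDNPS instruction computes ~op1 & op2, so the swap goes the
                // other way there; keeping the node out of HIR means only lowering has to know about that quirk.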
+ break; + } + #if defined(TARGET_XARCH) if (simdSize == 64) { @@ -30155,13 +30162,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); -#if defined(TARGET_XARCH) - if (oper == GT_AND_NOT) - { - // xarch does: ~op1 & op2, we need op1 & ~op2 - std::swap(op1, op2); - } -#endif // TARGET_XARCH + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); GenTree* cnsNode = nullptr; GenTree* otherNode = nullptr; @@ -30674,31 +30676,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - case GT_AND_NOT: - { - // Handle `x & ~0 == x` and `0 & ~x == 0` - if (cnsNode->IsVectorZero()) - { - if (cnsNode == op1) - { - resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT); - break; - } - else - { - resultNode = otherNode; - } - break; - } - - // Handle `x & ~AllBitsSet == 0` - if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2)) - { - resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT); - } - break; - } - case GT_DIV: { if (varTypeIsFloating(simdBaseType)) @@ -31089,12 +31066,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) { switch (ni) { - case NI_Vector128_ConditionalSelect: #if defined(TARGET_XARCH) + case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: case NI_Vector512_ConditionalSelect: #elif defined(TARGET_ARM64) - case NI_Vector64_ConditionalSelect: + case NI_AdvSimd_BitwiseSelect: case NI_Sve_ConditionalSelect: #endif { diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 3a13fcfab7358c..eeccc08cce0908 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -611,6 +611,40 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_AdvSimd_BitwiseClear: + { + assert(sig->numArgs == 2); + + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead so decompose into the individual operations + // on import + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + break; + } + + case NI_AdvSimd_OrNot: + { + assert(sig->numArgs == 2); + + // We don't want to support creating OR_NOT nodes prior to LIR + // as it can break important optimizations. 
We'll produces this + // in lowering instead so decompose into the individual operations + // on import + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize); + break; + } + case NI_Vector64_AndNot: case NI_Vector128_AndNot: { diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index d96690c4003601..bb3876dd085e25 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -2856,6 +2856,46 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption break; } + case NI_EVEX_XnorMask: + { + assert(instOptions == INS_OPTS_NONE); + + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kxnorb; + } + else if (count == 16) + { + ins = INS_kxnorw; + } + else if (count == 32) + { + ins = INS_kxnord; + } + else + { + assert(count == 64); + ins = INS_kxnorq; + } + + op1Reg = op1->GetRegNum(); + + GenTree* op2 = node->Op(2); + regNumber op2Reg = op2->GetRegNum(); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + assert(emitter::isMaskReg(op2Reg)); + + // Use EA_32BYTE to ensure the VEX.L bit gets set + emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg); + break; + } + case NI_AVX512F_ConvertToInt32: case NI_AVX512F_ConvertToUInt32: case NI_AVX512F_ConvertToUInt32WithTruncation: diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index 1881a70c041f1a..a0d52376ded103 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -245,7 +245,7 @@ HARDWARE_INTRINSIC(AdvSimd, AddScalar, HARDWARE_INTRINSIC(AdvSimd, AddWideningLower, 8, 2, true, {INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddw, INS_uaddw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AdvSimd, AddWideningUpper, 16, 2, true, {INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddw2, INS_uaddw2, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AdvSimd, And, -1, 2, true, {INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and}, HW_Category_SIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AdvSimd, BitwiseSelect, -1, 3, true, {INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AdvSimd, Ceiling, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_invalid}, HW_Category_SIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AdvSimd, CeilingScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_frintp, INS_frintp}, HW_Category_SIMD, HW_Flag_SIMDScalar) @@ -383,7 +383,7 @@ HARDWARE_INTRINSIC(AdvSimd, NegateSaturate, HARDWARE_INTRINSIC(AdvSimd, NegateScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fneg, INS_fneg}, HW_Category_SIMD, HW_Flag_SIMDScalar) HARDWARE_INTRINSIC(AdvSimd, Not, -1, 1, true, {INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn}, HW_Category_SIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AdvSimd, Or, -1, 2, true, {INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr}, HW_Category_SIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiply, -1, 2, true, {INS_pmul, INS_pmul, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningLower, 8, 2, true, {INS_pmull, INS_pmull, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningUpper, 16, 2, true, {INS_pmull2, INS_pmull2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 58c51a2a6bcdd9..0d8eba2e3be1d2 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -361,7 +361,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem, HARDWARE_INTRINSIC(SSE, Add, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -465,7 +465,7 @@ HARDWARE_INTRINSIC(SSE2, Add, HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, And, 16, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -696,7 +696,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32, HARDWARE_INTRINSIC(AVX, Add, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, And, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, 
HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, Blend, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -778,7 +778,7 @@ HARDWARE_INTRINSIC(AVX2, Add, HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, false, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, And, 32, 2, false, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Average, 32, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, true, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) @@ -853,7 +853,7 @@ HARDWARE_INTRINSIC(AVX512F, AddScalar, HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, 
false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) @@ -1156,7 +1156,7 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount, // AVX512DQ Intrinsics #define FIRST_NI_AVX512DQ NI_AVX512DQ_And HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, 
INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1341,7 +1341,7 @@ HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, // AVX10V1_V512 Intrinsics #define FIRST_NI_AVX10v1_V512 NI_AVX10v1_V512_And HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1411,7 +1411,7 @@ HARDWARE_INTRINSIC(AES, KeygenAssist, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics #define FIRST_NI_BMI1 NI_BMI1_AndNot -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 
1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) @@ -1425,7 +1425,7 @@ HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics #define FIRST_NI_BMI1_X64 NI_BMI1_X64_AndNot -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) @@ -1572,6 +1572,7 @@ HARDWARE_INTRINSIC(EVEX, OrMask, HARDWARE_INTRINSIC(EVEX, ShiftLeftMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(EVEX, ShiftRightMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(EVEX, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(EVEX, XnorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp 
b/src/coreclr/jit/hwintrinsicxarch.cpp index a09fb4950f5068..fe9539037f1473 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1394,15 +1394,52 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_SSE_AndNot: + case NI_SSE2_AndNot: + case NI_AVX_AndNot: + case NI_AVX2_AndNot: + case NI_AVX512F_AndNot: + case NI_AVX512DQ_AndNot: + case NI_AVX10v1_V512_AndNot: + { + assert(sig->numArgs == 2); + + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead so decompose into the individual operations + // on import, taking into account that despite the name, these APIs + // do (~op1 & op2), so we need to account for that + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + op1 = gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + break; + } + + case NI_BMI1_AndNot: + case NI_BMI1_X64_AndNot: + { + assert(sig->numArgs == 2); + + // The same general reasoning for the decomposition exists here as + // given above for the SIMD AndNot APIs. + + op2 = impPopStack().val; + op1 = impPopStack().val; + + op1 = gtNewOperNode(GT_NOT, retType, op1); + retNode = gtNewOperNode(GT_AND, retType, op1, op2); + break; + } + case NI_Vector128_AndNot: case NI_Vector256_AndNot: case NI_Vector512_AndNot: { assert(sig->numArgs == 2); - impSpillSideEffect(true, - verCurrentState.esStackDepth - 2 DEBUGARG("Spilling op1 side effects for HWIntrinsic")); - op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 7447c35de2f297..281b4f6ed4ccd8 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -160,20 +160,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask); GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize); -#if defined(TARGET_XARCH) - if (canUseEvexEncoding()) - { - GenTree* control; - - control = gtNewIconNode(static_cast((0xF0 | 0xCC) ^ 0xAA)); // (A | B)) ^ C - xor1 = gtNewSimdTernaryLogicNode(simdType, vec1, toLowerVec1, cnsVec1, control, baseType, simdSize); - } - else -#endif // TARGET_XARCH - { - vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize); - xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize); - } + vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize); + xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize); vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize); } @@ -184,22 +172,10 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( // ((v1 ^ cns1) | (v2 ^ cns2)) == zero -#if defined(TARGET_XARCH) - if (canUseEvexEncoding()) - { - GenTree* control; - - control = gtNewIconNode(static_cast(0xF0 | (0xCC ^ 0xAA))); // A | (B ^ C) - orr = gtNewSimdTernaryLogicNode(simdType, xor1, vec2, cnsVec2, control, baseType, simdSize); - } - else -#endif // TARGET_XARCH - { - GenTree* xor2; + GenTree* xor2; - xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize); - orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize); - } + xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, 
simdSize); + orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize); // Optimization: use a single load when byteLen equals simdSize. // For code simplicity we always create nodes for two vectors case. diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index ca37c3a951c59a..3e268d5d71c6af 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1233,6 +1233,76 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + bool isScalar = false; + genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar); + + switch (oper) + { + case GT_AND: + case GT_OR: + { + // We want to recognize (~op1 & op2) and transform it + // into AdvSimd.AndNot(op2, op1) as well as (op1 & ~op2) + // transforming it into AdvSimd.AndNot(op1, op2) + // + // We want to similarly handle (~op1 | op2) and (op1 | ~op2) + + bool transform = false; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op2->OperIsHWIntrinsic()) + { + bool op2IsScalar = false; + genTreeOps op2Oper = op2->AsHWIntrinsic()->GetOperForHWIntrinsicId(&op2IsScalar); + + if (op2Oper == GT_NOT) + { + assert(!op2IsScalar); + transform = true; + } + } + + if (!transform && op1->OperIsHWIntrinsic()) + { + bool op1IsScalar = false; + genTreeOps op1Oper = op1->AsHWIntrinsic()->GetOperForHWIntrinsicId(&op1IsScalar); + + if (op1Oper == GT_NOT) + { + assert(!op1IsScalar); + transform = true; + std::swap(op1, op2); + } + } + + if (transform) + { + if (oper == GT_AND) + { + oper = GT_AND_NOT; + intrinsicId = NI_AdvSimd_BitwiseClear; + } + else + { + assert(oper == GT_OR); + oper = GT_NONE; + intrinsicId = NI_AdvSimd_OrNot; + } + + node->ChangeHWIntrinsicId(intrinsicId, op1, op2); + oper = GT_AND_NOT; + } + break; + } + + default: + { + break; + } + } + switch (intrinsicId) { case NI_Vector64_Create: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 97800d5d10ddb8..372ea7df10ecda 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1418,6 +1418,60 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) bool isScalar = false; genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar); + if (oper == GT_AND) + { + // We want to recognize (~op1 & op2) and transform it + // into XarchIsa.AndNot(op1, op2) as well as (op1 & ~op2) + // transforming it into XarchIsa.AndNot(op2, op1), which + // takes into account that the XARCH apis operate more like + // NotAnd + + bool transform = false; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op1Intrin = op1->AsHWIntrinsic(); + + bool op1IsScalar = false; + genTreeOps op1Oper = op1Intrin->GetOperForHWIntrinsicId(&op1IsScalar); + + if ((op1Oper == GT_XOR) && op1Intrin->Op(2)->IsVectorAllBitsSet()) + { + assert(!op1IsScalar); + transform = true; + } + } + + if (!transform && op2->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic(); + + bool op2IsScalar = false; + genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar); + + if ((op2Oper == GT_XOR) && op2Intrin->Op(2)->IsVectorAllBitsSet()) + { + assert(!op2IsScalar); + transform = true; + std::swap(op1, op2); + } + } + + if (transform) + { + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + oper = GT_AND_NOT; + intrinsicId = + GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, oper, op1, op2, 
simdBaseType, simdSize, false); + node->ChangeHWIntrinsicId(intrinsicId, op1, op2); + } + } + switch (oper) { case GT_AND: @@ -1741,6 +1795,81 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_EVEX_AndMask: + { + // We want to recognize (~op1 & op2) and transform it + // into Evex.AndNotMask(op1, op2) as well as (op1 & ~op2) + // transforming it into Evex.AndNotMask(op2, op1), which + // takes into account that the XARCH APIs operate more like + // NotAnd + + bool transform = false; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op1->OperIsHWIntrinsic(NI_EVEX_NotMask)) + { + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(op1->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + { + transform = true; + } + } + + if (!transform && op2->OperIsHWIntrinsic(NI_EVEX_NotMask)) + { + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(op2->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + { + transform = true; + std::swap(op1, op2); + } + } + + if (transform) + { + intrinsicId = NI_EVEX_AndNotMask; + node->ChangeHWIntrinsicId(intrinsicId, op1, op2); + } + break; + } + + case NI_EVEX_NotMask: + { + // We want to recognize ~(op1 ^ op2) and transform it + // into Evex.XnorMask(op1, op2) + + GenTree* op1 = node->Op(1); + + if (op1->OperIsHWIntrinsic(NI_EVEX_XorMask)) + { + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(op1->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(op1); + } + else + { + op1->SetUnusedValue(); + } + + BlockRange().Remove(node); + node = op1->AsHWIntrinsic(); + + intrinsicId = NI_EVEX_XnorMask; + node->ChangeHWIntrinsicId(intrinsicId, node->Op(1), node->Op(2)); + } + } + break; + } + case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 0205f020825ff4..3bde4235325a10 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9971,6 +9971,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar); genTreeOps oper = actualOper; + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) { GenTree* op1 = node->Op(1); @@ -10049,12 +10052,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case GT_AND_NOT: - { - maskIntrinsicId = NI_EVEX_AndNotMask; - break; - } - case GT_NOT: { maskIntrinsicId = NI_EVEX_NotMask; @@ -10134,91 +10131,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) switch (oper) { - // Transforms: - // 1. (~v1 & v2) to VectorXxx.AndNot(v2, v1) - // 2. 
(v1 & ~v2) to VectorXxx.AndNot(v1, v2) - case GT_AND: - { - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); - GenTree* lhs = nullptr; - GenTree* rhs = nullptr; - - if (op1->OperIsHWIntrinsic()) - { - // Try handle: ~op1 & op2 - GenTreeHWIntrinsic* hw = op1->AsHWIntrinsic(); - genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar); - - if (isScalar) - { - return node; - } - -#if defined(TARGET_ARM64) - if (hwOper == GT_NOT) - { - lhs = op2; - rhs = hw->Op(1); - } -#elif defined(TARGET_XARCH) - if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet()) - { - lhs = op2; - rhs = hw->Op(1); - } -#endif // !TARGET_ARM64 && !TARGET_XARCH - } - - if ((lhs == nullptr) && op2->OperIsHWIntrinsic()) - { - // Try handle: op1 & ~op2 - GenTreeHWIntrinsic* hw = op2->AsHWIntrinsic(); - genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar); - - if (isScalar) - { - return node; - } - -#if defined(TARGET_ARM64) - if (hwOper == GT_NOT) - { - lhs = op1; - rhs = hw->Op(1); - } -#elif defined(TARGET_XARCH) - if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet()) - { - lhs = op1; - rhs = hw->Op(1); - } -#endif // !TARGET_ARM64 && !TARGET_XARCH - } - - if (lhs == nullptr) - { - break; - } - assert(rhs != nullptr); - - // Filter out side effecting cases for several reasons: - // 1. gtNewSimdBinOpNode may swap operand order. - // 2. The code above will swap operand order. - // 3. The code above does not handle GTF_REVERSE_OPS. - if (((lhs->gtFlags | rhs->gtFlags) & GTF_ALL_EFFECT) != 0) - { - break; - } - - GenTree* andnNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, lhs, rhs, simdBaseJitType, simdSize); - - DEBUG_DESTROY_NODE(node); - INDEBUG(andnNode->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED); - - return andnNode; - } - #if defined(TARGET_ARM64) // Transforms: // 1. 
-(-v1) to v1 diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 09ea63f5e0ae4b..1b795678e7b000 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -7919,13 +7919,11 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, if (oper != GT_NONE) { + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + #if defined(TARGET_XARCH) - if (oper == GT_AND_NOT) - { - // xarch does: ~arg0VN & arg1VN - std::swap(arg0VN, arg1VN); - } - else if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ)) + if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ)) { if (TypeOfVN(arg1VN) == TYP_SIMD16) { @@ -8047,6 +8045,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (isScalar) { // We don't support folding scalars today @@ -8108,37 +8109,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, break; } - case GT_AND_NOT: - { -#if defined(TARGET_XARCH) - std::swap(arg0VN, arg1VN); -#endif // TARGET_XARCH - - // Handle `x & ~0 == x` and `0 & ~x == 0` - ValueNum zeroVN = VNZeroForType(type); - - if (cnsVN == zeroVN) - { - if (cnsVN == arg0VN) - { - return zeroVN; - } - return argVN; - } - - // Handle `x & ~AllBitsSet == 0` - ValueNum allBitsVN = VNAllBitsForType(type); - - if (cnsVN == allBitsVN) - { - if (cnsVN == arg1VN) - { - return zeroVN; - } - } - break; - } - case GT_DIV: { if (varTypeIsFloating(baseType)) @@ -8397,6 +8367,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (isScalar) { // We don't support folding scalars today @@ -8411,12 +8384,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, return arg0VN; } - case GT_AND_NOT: - { - // Handle `x & ~x == 0` - return VNZeroForType(type); - } - case GT_OR: { // Handle `x | x == x` @@ -8575,12 +8542,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(GenTreeHWIntrinsic* tree, switch (ni) { - case NI_Vector128_ConditionalSelect: #if defined(TARGET_XARCH) + case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: case NI_Vector512_ConditionalSelect: #elif defined(TARGET_ARM64) - case NI_Vector64_ConditionalSelect: + case NI_AdvSimd_BitwiseSelect: case NI_Sve_ConditionalSelect: #endif { From 51ece716e37e3c7ee16365720c392794652ac304 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sun, 7 Jul 2024 07:12:57 -0700 Subject: [PATCH 02/10] Ensure that we actually remove the underlying op --- src/coreclr/jit/lowerarmarch.cpp | 15 ++++++-- src/coreclr/jit/lowerxarch.cpp | 59 ++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 3e268d5d71c6af..498edfdcb54288 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1254,25 +1254,36 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) if (op2->OperIsHWIntrinsic()) { + GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic(); + bool op2IsScalar = false; - genTreeOps op2Oper = 
op2->AsHWIntrinsic()->GetOperForHWIntrinsicId(&op2IsScalar); + genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar); if (op2Oper == GT_NOT) { assert(!op2IsScalar); transform = true; + + op2 = op2Intrin->Op(1); + BlockRange().Remove(op2Intrin); } } if (!transform && op1->OperIsHWIntrinsic()) { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + bool op1IsScalar = false; - genTreeOps op1Oper = op1->AsHWIntrinsic()->GetOperForHWIntrinsicId(&op1IsScalar); + genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar); if (op1Oper == GT_NOT) { assert(!op1IsScalar); transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); + std::swap(op1, op2); } } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 372ea7df10ecda..df178bb56d47a5 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1433,29 +1433,38 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) if (op1->OperIsHWIntrinsic()) { - GenTreeHWIntrinsic* op1Intrin = op1->AsHWIntrinsic(); + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); bool op1IsScalar = false; - genTreeOps op1Oper = op1Intrin->GetOperForHWIntrinsicId(&op1IsScalar); + genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar); - if ((op1Oper == GT_XOR) && op1Intrin->Op(2)->IsVectorAllBitsSet()) + if ((op1Oper == GT_XOR) && opIntrin->Op(2)->IsVectorAllBitsSet()) { assert(!op1IsScalar); transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin->Op(2)); + BlockRange().Remove(opIntrin); } } if (!transform && op2->OperIsHWIntrinsic()) { - GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic(); + GenTreeHWIntrinsic* opIntrin = op2->AsHWIntrinsic(); bool op2IsScalar = false; - genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar); + genTreeOps op2Oper = opIntrin->GetOperForHWIntrinsicId(&op2IsScalar); - if ((op2Oper == GT_XOR) && op2Intrin->Op(2)->IsVectorAllBitsSet()) + if ((op2Oper == GT_XOR) && opIntrin->Op(2)->IsVectorAllBitsSet()) { assert(!op2IsScalar); transform = true; + + op2 = opIntrin->Op(1); + BlockRange().Remove(opIntrin->Op(2)); + BlockRange().Remove(opIntrin); + std::swap(op1, op2); } } @@ -1810,21 +1819,30 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) if (op1->OperIsHWIntrinsic(NI_EVEX_NotMask)) { - unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); - if (genTypeSize(op1->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) { transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); } } if (!transform && op2->OperIsHWIntrinsic(NI_EVEX_NotMask)) { - unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + GenTreeHWIntrinsic* opIntrin = op2->AsHWIntrinsic(); + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); - if (genTypeSize(op2->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) { transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); + std::swap(op1, op2); } } @@ -1846,25 +1864,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) if (op1->OperIsHWIntrinsic(NI_EVEX_XorMask)) { - unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + unsigned 
simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); - if (genTypeSize(op1->AsHWIntrinsic()->GetSimdBaseType()) == simdBaseTypeSize) + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) { - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - use.ReplaceWith(op1); - } - else - { - op1->SetUnusedValue(); - } - - BlockRange().Remove(node); - node = op1->AsHWIntrinsic(); - intrinsicId = NI_EVEX_XnorMask; - node->ChangeHWIntrinsicId(intrinsicId, node->Op(1), node->Op(2)); + node->ResetHWIntrinsicId(intrinsicId, comp, opIntrin->Op(1), opIntrin->Op(2)); + BlockRange().Remove(opIntrin); } } break; From 75741109cc134c0e4ee279a8e05279da0f7021b4 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sun, 7 Jul 2024 10:24:17 -0700 Subject: [PATCH 03/10] Ensure the AND_NOT decomposition is still folded during import for minopts --- src/coreclr/jit/gentree.cpp | 15 +++++++++++---- src/coreclr/jit/hwintrinsicarm64.cpp | 18 ++++-------------- src/coreclr/jit/hwintrinsicxarch.cpp | 12 +++++++++--- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index fa1bd73aa3dea0..90604ff8c2ae59 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20615,10 +20615,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si GenTree* bitMask; - bitMask = gtNewDconNode(-0.0, simdBaseType); - bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize); - - return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize); + if (simdBaseType == TYP_FLOAT) + { + bitMask = gtNewIconNode(0x7FFFFFFF); + bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize); + } + else + { + bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF); + bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize); + } + return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize); } NamedIntrinsic intrinsic = NI_Illegal; diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index eeccc08cce0908..e0dbdf3d8f1250 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -612,6 +612,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } case NI_AdvSimd_BitwiseClear: + case NI_Vector64_AndNot: + case NI_Vector128_AndNot: { assert(sig->numArgs == 2); @@ -623,7 +625,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize); + op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); break; } @@ -640,23 +642,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op2 = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize); + op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize); break; } - case NI_Vector64_AndNot: - case NI_Vector128_AndNot: - { - assert(sig->numArgs == 2); - - op2 = impSIMDPopStack(); - op1 = impSIMDPopStack(); - - retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize); - break; - } - case NI_Vector64_As: case NI_Vector64_AsByte: case 
NI_Vector64_AsDouble: diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index fe9539037f1473..4aa5e5c2d128ff 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1413,7 +1413,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op1 = gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize); + op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); break; } @@ -1429,7 +1429,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impPopStack().val; - op1 = gtNewOperNode(GT_NOT, retType, op1); + op1 = gtFoldExpr(gtNewOperNode(GT_NOT, retType, op1)); retNode = gtNewOperNode(GT_AND, retType, op1, op2); break; } @@ -1440,10 +1440,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead so decompose into the individual operations + // on import + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize); + op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); break; } From 3ad1ec20c65cea50b916f47894228144d9a155aa Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sun, 7 Jul 2024 10:36:30 -0700 Subject: [PATCH 04/10] Ensure we propagate AllBitsSet into simd GT_XOR on xarch --- src/coreclr/jit/gentree.cpp | 15 +++++++++++++++ src/coreclr/jit/hwintrinsiclistxarch.h | 14 +++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 90604ff8c2ae59..d7fe83c22893c2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -28902,6 +28902,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec return IsUserCall() && (operand == Op(2)); } +#if defined(TARGET_XARCH) + case NI_SSE_Xor: + case NI_SSE2_Xor: + case NI_AVX_Xor: + case NI_AVX2_Xor: + case NI_AVX512F_Xor: + case NI_AVX512DQ_Xor: + case NI_AVX10v1_V512_Xor: + { + // We recognize this as GT_NOT which can enable other optimizations + assert(GetOperandCount() == 2); + return vecCon->IsVectorAllBitsSet(); + } +#endif // TARGET_XARCH + default: { break; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 0d8eba2e3be1d2..ebb82a2d863536 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -441,7 +441,7 @@ HARDWARE_INTRINSIC(SSE, Subtract, HARDWARE_INTRINSIC(SSE, SubtractScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, UnpackHigh, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, UnpackLow, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_SSE NI_SSE_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -563,7 +563,7 @@ HARDWARE_INTRINSIC(SSE2, SubtractScalar, HARDWARE_INTRINSIC(SSE2, SumAbsoluteDifferences, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, UnpackHigh, 16, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, UnpackLow, 16, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_SSE2 NI_SSE2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -764,7 +764,7 @@ HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, true, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX NI_AVX_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -838,7 +838,7 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate, HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX2 NI_AVX2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -975,7 +975,7 @@ HARDWARE_INTRINSIC(AVX512F, SubtractScalar, HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, 
INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX512F NI_AVX512F_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1176,7 +1176,7 @@ HARDWARE_INTRINSIC(AVX512DQ, Range, HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX512DQ NI_AVX512DQ_Xor // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1364,7 +1364,7 @@ HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8, HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX10v1_V512 NI_AVX10v1_V512_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** From d08dcb47f61ed236e9886c72c81519ed75fee493 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sun, 7 Jul 2024 15:23:55 -0700 Subject: [PATCH 05/10] Ensure that we prefer AndNot over TernaryLogic --- src/coreclr/jit/lowerxarch.cpp | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index df178bb56d47a5..62dd3a5862beba 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1508,43 +1508,41 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) +- t1 t2 = binary logical op2 */ - GenTree* second = use.User(); - if (!second->OperIs(GT_HWINTRINSIC)) + GenTree* user = use.User(); + if (!user->OperIs(GT_HWINTRINSIC)) { break; } - bool nestedIsScalar = false; - genTreeOps nestedOper = second->AsHWIntrinsic()->GetOperForHWIntrinsicId(&isScalar); + GenTreeHWIntrinsic* userIntrin = user->AsHWIntrinsic(); - if (nestedOper == GT_NONE) - { - // TODO: We should support cases like CNDSEL - break; - } + bool userIsScalar = false; + genTreeOps userOper 
= userIntrin->GetOperForHWIntrinsicId(&isScalar); - if (nestedIsScalar) + if ((userOper != GT_AND) && (userOper != GT_OR) && (userOper != GT_XOR)) { + // TODO: We should support other cases like AND_NOT, CNDSEL, and TernaryLogic break; } + assert(!userIsScalar); - if ((nestedOper != GT_AND) && (nestedOper != GT_OR) && (nestedOper != GT_XOR)) + if ((userOper == GT_AND) && (oper == GT_XOR) && op2->IsVectorAllBitsSet()) { - // TODO: We should support other cases like AND_NOT, NOT, and CNDSEL - break; + // We have something that will transform to AND_NOT, which we want to + // prefer over TernaryLogic or any other transform. + return node->gtNext; } - GenTree* op3 = second->AsHWIntrinsic()->Op(1) == node ? second->AsHWIntrinsic()->Op(2) - : second->AsHWIntrinsic()->Op(1); - GenTree* control = comp->gtNewIconNode(node->GetTernaryControlByte(second->AsHWIntrinsic())); + GenTree* op3 = userIntrin->Op(1) == node ? userIntrin->Op(2) : userIntrin->Op(1); + GenTree* control = comp->gtNewIconNode(node->GetTernaryControlByte(userIntrin)); CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); unsigned simdSize = node->GetSimdSize(); var_types simdType = Compiler::getSIMDTypeForSize(simdSize); GenTree* ternaryNode = comp->gtNewSimdTernaryLogicNode(simdType, op1, op2, op3, control, simdBaseJitType, simdSize); - BlockRange().InsertBefore(second, control, ternaryNode); + BlockRange().InsertBefore(userIntrin, control, ternaryNode); LIR::Use finalRes; - if (BlockRange().TryGetUse(second, &finalRes)) + if (BlockRange().TryGetUse(userIntrin, &finalRes)) { finalRes.ReplaceWith(ternaryNode); } @@ -1554,7 +1552,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) } GenTree* next = node->gtNext; BlockRange().Remove(node); - BlockRange().Remove(second); + BlockRange().Remove(userIntrin); return next; } break; From 2c93a186f4dbdb6d335fd3f8c7db835810e8c4f7 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sun, 7 Jul 2024 17:23:08 -0700 Subject: [PATCH 06/10] Cleanup the TernaryLogic lowering code --- src/coreclr/jit/gentree.cpp | 79 ---------- src/coreclr/jit/gentree.h | 1 - src/coreclr/jit/hwintrinsic.cpp | 31 ++++ src/coreclr/jit/hwintrinsic.h | 1 + src/coreclr/jit/lowerxarch.cpp | 269 ++++++++++++++++---------------- 5 files changed, 169 insertions(+), 212 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index d7fe83c22893c2..62f748bca3cf27 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -29710,85 +29710,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree return 0; } - -//------------------------------------------------------------------------ -// GetTernaryControlByte: calculate the value of the control byte for ternary node -// with given logic nodes on the input. -// -// Return value: the value of the ternary control byte. -uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const -{ - // we assume we have a structure like: - /* - /- A - +- B - t1 = binary logical op1 - - /- C - +- t1 - t2 = binary logical op2 - */ - - // To calculate the control byte value: - // The way the constants work is we have three keys: - // * A: 0xF0 - // * B: 0xCC - // * C: 0xAA - // - // To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you - // wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A. 
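To make the key-based encoding described in the comment above concrete, here is a small standalone sketch (illustrative only, not part of the patch) that reproduces the quoted computation; the A/B/C key values are the ones the JIT uses for vpternlog control bytes:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint8_t A = 0xF0;
    const uint8_t B = 0xCC;
    const uint8_t C = 0xAA;

    // (A & B) ^ C -> (0xF0 & 0xCC) ^ 0xAA == 0x6A, matching the example in the comment
    uint8_t control = static_cast<uint8_t>((A & B) ^ C);
    printf("control = 0x%02X\n", control); // prints: control = 0x6A
    return 0;
}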
- assert(second->Op(1) == this || second->Op(2) == this); - const uint8_t A = 0xF0; - const uint8_t B = 0xCC; - const uint8_t C = 0xAA; - - bool isScalar = false; - - genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar); - assert(!isScalar); - - genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar); - assert(!isScalar); - - uint8_t AB = 0; - uint8_t ABC = 0; - - if (firstOper == GT_AND) - { - AB = A & B; - } - else if (firstOper == GT_OR) - { - AB = A | B; - } - else if (firstOper == GT_XOR) - { - AB = A ^ B; - } - else - { - unreached(); - } - - if (secondOper == GT_AND) - { - ABC = AB & C; - } - else if (secondOper == GT_OR) - { - ABC = AB | C; - } - else if (secondOper == GT_XOR) - { - ABC = AB ^ C; - } - else - { - unreached(); - } - - return ABC; -} #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS unsigned GenTreeLclFld::GetSize() const diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index ec043090da3465..67137fec968303 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6534,7 +6534,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperRequiresGlobRefFlag() const; unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3); - uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const; ClassLayout* GetLayout(Compiler* compiler) const; diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 4f016940840b2c..195af93de557b1 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -352,6 +352,37 @@ const TernaryLogicInfo& TernaryLogicInfo::lookup(uint8_t control) return ternaryLogicFlags[control]; } + +uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2) +{ + switch (oper) + { + case GT_AND: + { + return static_cast(op1 & op2); + } + + case GT_AND_NOT: + { + return static_cast(~op1 & op2); + } + + case GT_OR: + { + return static_cast(op1 & op2); + } + + case GT_XOR: + { + return static_cast(op1 & op2); + } + + default: + { + unreached(); + } + } +} #endif // TARGET_XARCH //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 915657e0146310..e6a802920c0571 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -480,6 +480,7 @@ struct TernaryLogicInfo TernaryLogicUseFlags oper3Use : 3; static const TernaryLogicInfo& lookup(uint8_t control); + static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2); TernaryLogicUseFlags GetAllUseFlags() const { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 62dd3a5862beba..d7732e0cbe8fe1 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1418,181 +1418,183 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) bool isScalar = false; genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar); - if (oper == GT_AND) + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) { - // We want to recognize (~op1 & op2) and transform it - // into XarchIsa.AndNot(op1, op2) as well as (op1 & ~op2) - // transforming it into XarchIsa.AndNot(op2, op1), which - // takes into account that the XARCH apis operate more like - // NotAnd + // These are the control bytes used for TernaryLogic - bool transform = false; + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; + + var_types simdType = node->TypeGet(); + 
CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); + GenTree* op3 = nullptr; - if (op1->OperIsHWIntrinsic()) - { - GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + // We want to specially recognize this pattern as GT_NOT + bool isOperNot = (oper == GT_XOR) && op2->IsVectorAllBitsSet(); + bool isV512Supported = false; - bool op1IsScalar = false; - genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar); + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + GenTree* user = use.User(); - if ((op1Oper == GT_XOR) && opIntrin->Op(2)->IsVectorAllBitsSet()) + if (user->OperIsHWIntrinsic()) { - assert(!op1IsScalar); - transform = true; + GenTreeHWIntrinsic* userIntrin = user->AsHWIntrinsic(); - op1 = opIntrin->Op(1); - BlockRange().Remove(opIntrin->Op(2)); - BlockRange().Remove(opIntrin); - } - } + bool userIsScalar = false; + genTreeOps userOper = userIntrin->GetOperForHWIntrinsicId(&isScalar); - if (!transform && op2->OperIsHWIntrinsic()) - { - GenTreeHWIntrinsic* opIntrin = op2->AsHWIntrinsic(); + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(userOper)) + { + if (isOperNot && (userOper == GT_AND)) + { + // We want to specially handle GT_AND_NOT as its available without EVEX + GenTree* nextNode = node->gtNext; - bool op2IsScalar = false; - genTreeOps op2Oper = opIntrin->GetOperForHWIntrinsicId(&op2IsScalar); + BlockRange().Remove(op2); + BlockRange().Remove(node); - if ((op2Oper == GT_XOR) && opIntrin->Op(2)->IsVectorAllBitsSet()) - { - assert(!op2IsScalar); - transform = true; + // Note that despite its name, the xarch instruction does ~op1 & op2, so + // we need to ensure op1 is the value whose ones complement is computed - op2 = opIntrin->Op(1); - BlockRange().Remove(opIntrin->Op(2)); - BlockRange().Remove(opIntrin); + op2 = userIntrin->Op(2); - std::swap(op1, op2); - } - } + if (op2 == node) + { + op2 = userIntrin->Op(1); + } - if (transform) - { - var_types simdBaseType = node->GetSimdBaseType(); - unsigned simdSize = node->GetSimdSize(); + NamedIntrinsic intrinsic = + GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, simdBaseType, + simdSize, false); + userIntrin->ResetHWIntrinsicId(intrinsic, comp, op1, op2); - oper = GT_AND_NOT; - intrinsicId = - GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, oper, op1, op2, simdBaseType, simdSize, false); - node->ChangeHWIntrinsicId(intrinsicId, op1, op2); - } - } + return nextNode; + } - switch (oper) - { - case GT_AND: - case GT_OR: - case GT_XOR: - { - if (!comp->canUseEvexEncoding()) - { - break; - } + if (comp->compIsEvexOpportunisticallySupported(isV512Supported)) + { + // For everything else we want to lower it to a standard TernaryLogic node + GenTree* nextNode = node->gtNext; - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); + BlockRange().Remove(node); + op3 = userIntrin->Op(2); - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - // search for structure like: - /* - /- A - +- B - t1 = binary logical op1 + if (op3 == node) + { + op3 = userIntrin->Op(1); + } - /- C - +- t1 - t2 = binary logical op2 - */ - GenTree* user = use.User(); - if (!user->OperIs(GT_HWINTRINSIC)) - { - break; - } + uint8_t controlByte = 0x00; - GenTreeHWIntrinsic* userIntrin = user->AsHWIntrinsic(); + if ((userOper == GT_XOR) && op3->IsVectorAllBitsSet()) + { + // We're being used by what is actually GT_NOT, so we + // need to shift 
parameters down so that A is unused - bool userIsScalar = false; - genTreeOps userOper = userIntrin->GetOperForHWIntrinsicId(&isScalar); + std::swap(op2, op3); + std::swap(op1, op2); - if ((userOper != GT_AND) && (userOper != GT_OR) && (userOper != GT_XOR)) - { - // TODO: We should support other cases like AND_NOT, CNDSEL, and TernaryLogic - break; - } - assert(!userIsScalar); + if (isOperNot) + { + // We have what is actually a double not, so just return op2 + // which is the only actual value now that the parameters + // were shifted around - if ((userOper == GT_AND) && (oper == GT_XOR) && op2->IsVectorAllBitsSet()) - { - // We have something that will transform to AND_NOT, which we want to - // prefer over TernaryLogic or any other transform. - return node->gtNext; - } + assert(op1->IsVectorAllBitsSet()); + assert(op3->IsVectorAllBitsSet()); - GenTree* op3 = userIntrin->Op(1) == node ? userIntrin->Op(2) : userIntrin->Op(1); - GenTree* control = comp->gtNewIconNode(node->GetTernaryControlByte(userIntrin)); - CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); - unsigned simdSize = node->GetSimdSize(); - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - GenTree* ternaryNode = - comp->gtNewSimdTernaryLogicNode(simdType, op1, op2, op3, control, simdBaseJitType, simdSize); - BlockRange().InsertBefore(userIntrin, control, ternaryNode); - LIR::Use finalRes; - if (BlockRange().TryGetUse(userIntrin, &finalRes)) - { - finalRes.ReplaceWith(ternaryNode); - } - else - { - ternaryNode->SetUnusedValue(); + LIR::Use superUse; + if (BlockRange().TryGetUse(user, &superUse)) + { + superUse.ReplaceWith(op2); + } + else + { + op2->SetUnusedValue(); + } + + BlockRange().Remove(op3); + BlockRange().Remove(op1); + BlockRange().Remove(user); + + return nextNode; + } + else + { + // We're now doing NOT(OP(B, C)) + assert(op1->IsVectorAllBitsSet()); + + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, B, C); + controlByte = static_cast(~controlByte); + } + } + else if (isOperNot) + { + // A is unused, so we just want OP(NOT(B), C) + + assert(op2->IsVectorAllBitsSet()); + std::swap(op1, op2); + + controlByte = static_cast(~B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + else + { + // We have OP2(OP1(A, B), C) + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, A, B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + + NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; + + if (simdSize != 64) + { + ternaryLogicId = isV512Supported ? NI_AVX512F_VL_TernaryLogic : NI_AVX10v1_TernaryLogic; + } + + GenTree* op4 = comp->gtNewIconNode(controlByte); + BlockRange().InsertBefore(userIntrin, op4); + + userIntrin->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, op4); + return nextNode; + } } - GenTree* next = node->gtNext; - BlockRange().Remove(node); - BlockRange().Remove(userIntrin); - return next; } - break; } - default: + if (isOperNot && comp->compIsEvexOpportunisticallySupported(isV512Supported)) { - break; - } - } - - if ((oper == GT_XOR) && node->Op(2)->IsVectorAllBitsSet()) - { - bool isV512Supported = false; - if ((genTypeSize(node->GetSimdBaseType()) >= 4) && comp->compIsEvexOpportunisticallySupported(isV512Supported)) - { - var_types simdType = node->TypeGet(); - unsigned simdSize = node->GetSimdSize(); + // Lowering this to TernaryLogic(zero, zero, op1, ~C) is smaller + // and faster than emitting the pcmpeqd; pxor sequence. 
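As a sanity check on the claim above, the following standalone sketch (illustrative only, not part of the patch) shows why the control byte ~C == 0x55 behaves as a vector NOT of the third operand: for every combination of the A and B inputs it evaluates to the complement of C. The bit-index convention used here (A as the most significant selector bit) is an assumption of the sketch, derived from the 0xF0/0xCC/0xAA keys.

#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t C       = 0xAA;
    const uint8_t control = static_cast<uint8_t>(~C); // 0x55

    for (int a = 0; a <= 1; a++)
    {
        for (int b = 0; b <= 1; b++)
        {
            for (int c = 0; c <= 1; c++)
            {
                // The instruction selects bit 'index' of the control byte as the result bit
                int index  = (a << 2) | (b << 1) | c;
                int result = (control >> index) & 1;

                assert(result == (c ^ 1)); // always ~c, regardless of a and b
            }
        }
    }
    return 0;
}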
- GenTree* op1 = node->Op(1); - BlockRange().Remove(node->Op(2)); - - // We can't use the mask, but we can emit a ternary logic node + BlockRange().Remove(op2); NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; if (simdSize != 64) { - ternaryLogicId = !isV512Supported ? NI_AVX10v1_TernaryLogic : NI_AVX512F_VL_TernaryLogic; + ternaryLogicId = isV512Supported ? NI_AVX512F_VL_TernaryLogic : NI_AVX10v1_TernaryLogic; } - GenTree* op2 = comp->gtNewZeroConNode(simdType); + op3 = op1; + + op2 = comp->gtNewZeroConNode(simdType); BlockRange().InsertBefore(node, op2); - GenTree* op3 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op3); + op1 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op1); - GenTree* control = comp->gtNewIconNode(static_cast(~0xAA)); // ~C + GenTree* control = comp->gtNewIconNode(static_cast(~C)); BlockRange().InsertBefore(node, control); - node->ResetHWIntrinsicId(ternaryLogicId, comp, op3, op2, op1, control); + node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control); return LowerNode(node); } } @@ -3604,6 +3606,9 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } } + // TODO-XARCH-AVX512: We should look for nested TernaryLogic and BitwiseOper + // nodes so that we can fully take advantage of the instruction where possible + ContainCheckHWIntrinsic(node); return node->gtNext; } From 763997c1256a92aebafcc411da2465159d39c78b Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 8 Jul 2024 01:13:52 -0700 Subject: [PATCH 07/10] Ensure that TernaryLogic picks the best operand for containment --- src/coreclr/jit/gentree.cpp | 3 +- src/coreclr/jit/hwintrinsic.cpp | 332 +++++++++++++++- src/coreclr/jit/hwintrinsic.h | 10 +- src/coreclr/jit/lowerxarch.cpp | 644 ++++++++++++++++++++++++-------- 4 files changed, 836 insertions(+), 153 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 62f748bca3cf27..fd4f85bf176e3a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -29666,7 +29666,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3) { #if defined(TARGET_XARCH) - assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId)); + assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) || + HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId)); #elif defined(TARGET_ARM64) assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId)); #endif diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 195af93de557b1..451074ab3234fb 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -353,6 +353,18 @@ const TernaryLogicInfo& TernaryLogicInfo::lookup(uint8_t control) return ternaryLogicFlags[control]; } +//------------------------------------------------------------------------ +// GetTernaryControlByte: Get the control byte for a TernaryLogic operation +// given the oper and two existing control bytes +// +// Arguments: +// oper -- the operation being performed +// op1 -- the control byte for op1 +// op2 -- the control byte for op2 +// +// Return Value: +// The new control byte evaluated from performing oper on op1 and op2 +// uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2) { switch (oper) @@ -369,12 +381,12 @@ uint8_t 
TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, ui case GT_OR: { - return static_cast(op1 & op2); + return static_cast(op1 | op2); } case GT_XOR: { - return static_cast(op1 & op2); + return static_cast(op1 ^ op2); } default: @@ -383,6 +395,322 @@ uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, ui } } } + +//------------------------------------------------------------------------ +// GetTernaryControlByte: Get the control byte for a TernaryLogic operation +// given a ternary logic oper and two inputs +// +// Arguments: +// oper -- the operation being performed +// op1 -- the control byte for op1, this is ignored for unary oper +// op2 -- the control byte for op2 +// +// Return Value: +// The new control byte evaluated from performing oper on op1 and op2 +// +uint8_t TernaryLogicInfo::GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2) +{ + switch (oper) + { + case TernaryLogicOperKind::Select: + { + return op2; + } + + case TernaryLogicOperKind::Not: + { + return ~op2; + } + + case TernaryLogicOperKind::And: + { + return op1 & op2; + } + + case TernaryLogicOperKind::Nand: + { + return ~(op1 & op2); + } + + case TernaryLogicOperKind::Or: + { + return op1 | op2; + } + + case TernaryLogicOperKind::Nor: + { + return ~(op1 | op2); + } + + case TernaryLogicOperKind::Xor: + { + return op1 ^ op2; + } + + case TernaryLogicOperKind::Xnor: + { + return ~(op1 ^ op2); + } + + default: + { + unreached(); + } + } +} + +//------------------------------------------------------------------------ +// GetTernaryControlByte: Get the control byte for a TernaryLogic operation +// given an existing info and three control bytes +// +// Arguments: +// info -- the info describing the operation being performed +// op1 -- the control byte for op1 +// op2 -- the control byte for op2 +// op3 -- the control byte for op3 +// +// Return Value: +// The new control byte evaluated from performing info on op1, op2, and op3 +// +uint8_t TernaryLogicInfo::GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3) +{ + uint8_t oper1Result; + + switch (info.oper1Use) + { + case TernaryLogicUseFlags::None: + { + assert(info.oper2 == TernaryLogicOperKind::None); + assert(info.oper2Use == TernaryLogicUseFlags::None); + + assert(info.oper3 == TernaryLogicOperKind::None); + assert(info.oper3Use == TernaryLogicUseFlags::None); + + switch (info.oper1) + { + case TernaryLogicOperKind::False: + { + oper1Result = 0x00; + break; + } + + case TernaryLogicOperKind::True: + { + oper1Result = 0xFF; + break; + } + + default: + { + unreached(); + } + } + break; + } + + case TernaryLogicUseFlags::A: + { + oper1Result = GetTernaryControlByte(info.oper1, 0x00, op1); + break; + } + + case TernaryLogicUseFlags::B: + { + oper1Result = GetTernaryControlByte(info.oper1, 0x00, op2); + break; + } + + case TernaryLogicUseFlags::C: + { + oper1Result = GetTernaryControlByte(info.oper1, 0x00, op3); + break; + } + + case TernaryLogicUseFlags::AB: + { + oper1Result = GetTernaryControlByte(info.oper1, op1, op2); + break; + } + + case TernaryLogicUseFlags::AC: + { + oper1Result = GetTernaryControlByte(info.oper1, op1, op3); + break; + } + + case TernaryLogicUseFlags::BC: + { + oper1Result = GetTernaryControlByte(info.oper1, op2, op3); + break; + } + + case TernaryLogicUseFlags::ABC: + { + assert(info.oper2 == TernaryLogicOperKind::None); + assert(info.oper2Use == TernaryLogicUseFlags::None); + + assert(info.oper3 == TernaryLogicOperKind::None); + 
assert(info.oper3Use == TernaryLogicUseFlags::None); + + switch (info.oper1) + { + case TernaryLogicOperKind::Nor: + { + oper1Result = ~(op1 | op2 | op3); + break; + } + + case TernaryLogicOperKind::Minor: + { + oper1Result = 0x17; + break; + } + + case TernaryLogicOperKind::Xnor: + { + oper1Result = ~(op1 ^ op2 ^ op3); + break; + } + + case TernaryLogicOperKind::Nand: + { + oper1Result = ~(op1 & op2 & op3); + break; + } + + case TernaryLogicOperKind::And: + { + oper1Result = op1 & op2 & op3; + break; + } + + case TernaryLogicOperKind::Xor: + { + oper1Result = op1 ^ op2 ^ op3; + break; + } + + case TernaryLogicOperKind::Major: + { + oper1Result = 0xE8; + break; + } + + case TernaryLogicOperKind::Or: + { + oper1Result = op1 | op2 | op3; + break; + } + + default: + { + unreached(); + } + } + break; + } + + default: + { + unreached(); + } + } + + uint8_t oper2Result; + + switch (info.oper2Use) + { + case TernaryLogicUseFlags::None: + { + assert(info.oper3 == TernaryLogicOperKind::None); + assert(info.oper3Use == TernaryLogicUseFlags::None); + + oper2Result = oper1Result; + break; + } + + case TernaryLogicUseFlags::A: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op1); + break; + } + + case TernaryLogicUseFlags::B: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op2); + break; + } + + case TernaryLogicUseFlags::C: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op3); + break; + } + + case TernaryLogicUseFlags::AB: + { + oper2Result = GetTernaryControlByte(info.oper1, op1, op2); + break; + } + + case TernaryLogicUseFlags::AC: + { + oper2Result = GetTernaryControlByte(info.oper1, op1, op3); + break; + } + + case TernaryLogicUseFlags::BC: + { + oper2Result = GetTernaryControlByte(info.oper1, op2, op3); + break; + } + + default: + { + unreached(); + } + } + + uint8_t oper3Result; + + switch (info.oper3Use) + { + case TernaryLogicUseFlags::None: + { + assert(info.oper3 == TernaryLogicOperKind::None); + oper3Result = oper2Result; + break; + } + + case TernaryLogicUseFlags::A: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op1) | (oper2Result & ~op1); + break; + } + + case TernaryLogicUseFlags::B: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op2) | (oper2Result & ~op2); + break; + } + + case TernaryLogicUseFlags::C: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op3) | (oper2Result & ~op3); + break; + } + + default: + { + unreached(); + } + } + + return oper3Result; +} #endif // TARGET_XARCH //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index e6a802920c0571..52cb9eb9a48069 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -480,7 +480,10 @@ struct TernaryLogicInfo TernaryLogicUseFlags oper3Use : 3; static const TernaryLogicInfo& lookup(uint8_t control); - static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2); + + static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2); + static uint8_t GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2); + static uint8_t GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3); TernaryLogicUseFlags GetAllUseFlags() const { @@ -1025,6 +1028,11 @@ struct HWIntrinsicInfo HWIntrinsicFlag flags = lookupFlags(id); return (flags & HW_Flag_PermuteVar2x) != 0; 
} + + static bool IsTernaryLogic(NamedIntrinsic id) + { + return (id == NI_AVX512F_TernaryLogic) || (id == NI_AVX512F_VL_TernaryLogic) || (id == NI_AVX10v1_TernaryLogic); + } #endif // TARGET_XARCH #if defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index d7732e0cbe8fe1..02e52f00dfc4a6 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1576,6 +1576,40 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // and faster than emitting the pcmpeqd; pxor sequence. BlockRange().Remove(op2); + + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + + if (HWIntrinsicInfo::IsTernaryLogic(opIntrin->GetHWIntrinsicId())) + { + GenTree* opControl = opIntrin->Op(4); + + if (opControl->IsCnsIntOrI()) + { + // When the input is already a ternary logic node, we want to invert it rather + // than introduce a new ternary logic node. + + GenTree* nextNode = node->gtNext; + + GenTreeIntConCommon* opControlCns = opControl->AsIntConCommon(); + opControlCns->SetIconValue(static_cast(~opControlCns->IconValue())); + + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(op1); + } + else + { + op1->SetUnusedValue(); + } + + BlockRange().Remove(node); + return nextNode; + } + } + } + NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; if (simdSize != 64) @@ -3346,6 +3380,12 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) { assert(comp->canUseEvexEncodingDebugOnly()); + // These are the control bytes used for TernaryLogic + + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; + var_types simdType = node->gtType; CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); var_types simdBaseType = node->GetSimdBaseType(); @@ -3362,66 +3402,66 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) if (op4->IsCnsIntOrI()) { - uint8_t control = static_cast(op4->AsIntConCommon()->IconValue()); + uint8_t control = static_cast(op4->AsIntConCommon()->IconValue()); + const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); + TernaryLogicUseFlags useFlags = info.GetAllUseFlags(); switch (control) { - case 0xAC: // A ? C : B; (C & A) | (B & ~A) - case 0xB8: // B ? C : A; (C & B) | (A & ~B) - case 0xD8: // C ? B : A; (B & C) | (A & ~C) - case 0xCA: // A ? B : C; (B & A) | (C & ~A) - case 0xE2: // B ? A : C; (A & B) | (C & ~B) - case 0xE4: // C ? A : B; (A & C) | (B & ~C) + case static_cast((C & A) | (B & ~A)): // A ? C : B + case static_cast((C & B) | (A & ~B)): // B ? C : A + case static_cast((B & C) | (A & ~C)): // C ? B : A + case static_cast((B & A) | (C & ~A)): // A ? B : C + case static_cast((A & B) | (C & ~B)): // B ? A : C + case static_cast((A & C) | (B & ~C)): // C ? 
A : B { // For the operations that work as a conditional select, we want // to try and optimize it to use BlendVariableMask when the condition // is already a TYP_MASK - const TernaryLogicInfo& ternLogInfo = TernaryLogicInfo::lookup(control); - - assert(ternLogInfo.oper1 == TernaryLogicOperKind::Select); - assert(ternLogInfo.oper2 == TernaryLogicOperKind::Select); - assert(ternLogInfo.oper3 == TernaryLogicOperKind::Cond); + assert(info.oper1 == TernaryLogicOperKind::Select); + assert(info.oper2 == TernaryLogicOperKind::Select); + assert(info.oper3 == TernaryLogicOperKind::Cond); GenTree* condition = nullptr; GenTree* selectTrue = nullptr; GenTree* selectFalse = nullptr; - if (ternLogInfo.oper1Use == TernaryLogicUseFlags::A) + if (info.oper1Use == TernaryLogicUseFlags::A) { selectTrue = op1; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::B) + if (info.oper2Use == TernaryLogicUseFlags::B) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::C); selectFalse = op2; condition = op3; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::C); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::B); + assert(info.oper2Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::B); selectFalse = op3; condition = op2; } } - else if (ternLogInfo.oper1Use == TernaryLogicUseFlags::B) + else if (info.oper1Use == TernaryLogicUseFlags::B) { selectTrue = op2; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::A) + if (info.oper2Use == TernaryLogicUseFlags::A) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::C); selectFalse = op1; condition = op3; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::C); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::A); + assert(info.oper2Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::A); selectFalse = op3; condition = op1; @@ -3429,21 +3469,21 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } else { - assert(ternLogInfo.oper1Use == TernaryLogicUseFlags::C); + assert(info.oper1Use == TernaryLogicUseFlags::C); selectTrue = op3; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::A) + if (info.oper2Use == TernaryLogicUseFlags::A) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::B); + assert(info.oper3Use == TernaryLogicUseFlags::B); selectFalse = op1; condition = op2; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::B); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::A); + assert(info.oper2Use == TernaryLogicUseFlags::B); + assert(info.oper3Use == TernaryLogicUseFlags::A); selectFalse = op2; condition = op1; @@ -3601,6 +3641,92 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) default: { + switch (useFlags) + { + case TernaryLogicUseFlags::A: + { + // Mark the other operands as unused and replace them with zero constants + op3->SetUnusedValue(); + op3 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op3); + node->Op(3) = op3; + + op2->SetUnusedValue(); + op2 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op2); + node->Op(2) = op2; + + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(1), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + break; + } + + case 
TernaryLogicUseFlags::B: + { + // Mark the other operands as unused and replace them with zero constants + op3->SetUnusedValue(); + op3 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op3); + node->Op(3) = op3; + + op1->SetUnusedValue(); + op1 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(2), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, A, C, B); + op4->AsIntCon()->SetIconValue(control); + break; + } + + case TernaryLogicUseFlags::AB: + { + // Mark the other operands as unused and replace them with zero constants + op3->SetUnusedValue(); + op3 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op3); + node->Op(3) = op3; + + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(2), node->Op(3)); + std::swap(node->Op(1), node->Op(2)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, C, A, B); + op4->AsIntCon()->SetIconValue(control); + break; + } + + case TernaryLogicUseFlags::AC: + { + // Mark the other operands as unused and replace them with zero constants + op2->SetUnusedValue(); + op2 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op2); + node->Op(2) = op2; + + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(1), node->Op(2)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, B, A, C); + op4->AsIntCon()->SetIconValue(control); + break; + } + + default: + { + break; + } + } break; } } @@ -7385,6 +7511,11 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* op1, GenTree* op2) // f) If neither of them are local vars (i.e. tree temps), prefer to // mark op1 as reg optional for the same reason as mentioned in (d) above. 
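The normalization above swaps TernaryLogic operands and then rebuilds the control byte with the corresponding keys permuted (for example GetTernaryControlByte(info, C, B, A) after swapping operands 1 and 3). The standalone sketch below (illustrative only, not part of the patch; the Eval helper and its key-to-bit mapping are assumptions based on the 0xF0/0xCC/0xAA keys) checks that this permutation preserves the computed truth table:

#include <cstdint>

// Evaluate one bit of a ternary-logic truth table: 'a', 'b', 'c' are the bits
// taken from the operands bound to the A (0xF0), B (0xCC) and C (0xAA) keys.
static int Eval(uint8_t control, int a, int b, int c)
{
    int index = (a << 2) | (b << 1) | c;
    return (control >> index) & 1;
}

int main()
{
    const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;

    // Example operation: (op1 & op2) | op3
    uint8_t original = static_cast<uint8_t>((A & B) | C); // operands in slots A, B, C
    uint8_t swapped  = static_cast<uint8_t>((C & B) | A); // same operation after exchanging slots 1 and 3

    for (int a = 0; a <= 1; a++)
    {
        for (int b = 0; b <= 1; b++)
        {
            for (int c = 0; c <= 1; c++)
            {
                // Feeding the permuted control byte with the operands swapped
                // must produce the same result bit as the original.
                if (Eval(original, a, b, c) != Eval(swapped, c, b, a))
                {
                    return 1;
                }
            }
        }
    }
    return 0;
}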
+ if (op1 == nullptr) + { + return op2; + } + assert(!op1->IsRegOptional()); assert(!op2->IsRegOptional()); @@ -9429,7 +9560,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre assert(childBaseType == TYP_DOUBLE); } - if (comp->canUseEvexEncoding() && parentNode->OperIsEmbBroadcastCompatible()) + if (parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding()) { GenTree* broadcastOperand = hwintrinsic->Op(1); @@ -9467,7 +9598,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } - return parentNode->OperIsEmbBroadcastCompatible(); + return parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding(); } default: @@ -10030,59 +10161,60 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp2RegOptional = false; + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - if (op2->IsCnsVec() && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - node->OperIsEmbBroadcastCompatible()) - { - TryFoldCnsVecForEmbeddedBroadcast(node, op2->AsVecCon()); - } - else - { - MakeSrcContained(node, op2); - } + containedOperand = op2; } else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - if (op1->IsCnsVec() && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - node->OperIsEmbBroadcastCompatible()) + containedOperand = op1; + swapOperands = true; + } + else + { + if (supportsOp1RegOptional) { - TryFoldCnsVecForEmbeddedBroadcast(node, op1->AsVecCon()); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else + + if (supportsOp2RegOptional) { - MakeSrcContained(node, op1); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); } - // Swap the operands here to make the containment checks in codegen significantly simpler - std::swap(node->Op(1), node->Op(2)); + if (regOptionalOperand == op1) + { + swapOperands = true; + } } - else if (supportsOp1RegOptional) + + if (containedOperand != nullptr) { - if (supportsOp2RegOptional) + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && + comp->canUseEvexEncoding()) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - MakeSrcRegOptional(node, regOptionalOperand); - - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); - } + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + MakeSrcContained(node, containedOperand); } } - else if (supportsOp2RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op2); + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands) + { + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(1), node->Op(2)); } break; } @@ -10407,6 +10539,9 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp2RegOptional = false; bool supportsOp3RegOptional = false; + GenTree* containedOperand = nullptr; + 
GenTree* regOptionalOperand = nullptr; + LIR::Use use; GenTree* user = nullptr; @@ -10423,61 +10558,58 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if ((resultOpNum != 3) && IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) { // result = (op1 * op2) + [op3] - MakeSrcContained(node, op3); + containedOperand = op3; } else if ((resultOpNum != 2) && IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { // result = (op1 * [op2]) + op3 - MakeSrcContained(node, op2); + containedOperand = op2; } else if ((resultOpNum != 1) && !HWIntrinsicInfo::CopiesUpperBits(intrinsicId) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { // result = ([op1] * op2) + op3 - MakeSrcContained(node, op1); + containedOperand = op1; } - else if (supportsOp1RegOptional) + else { - if (supportsOp2RegOptional) + if (supportsOp1RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - - if (supportsOp3RegOptional) - { - regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); - } - - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else if (supportsOp3RegOptional) + + if (supportsOp2RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op3); - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); } - else + + if (supportsOp3RegOptional) { - MakeSrcRegOptional(node, op1); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); } } - else if (supportsOp2RegOptional) + + if (containedOperand != nullptr) { - if (supportsOp3RegOptional) + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && + comp->canUseEvexEncoding()) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op2, op3); - MakeSrcRegOptional(node, regOptionalOperand); + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op2); + MakeSrcContained(node, containedOperand); } } - else if (supportsOp3RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op3); + MakeSrcRegOptional(node, regOptionalOperand); } } else if (HWIntrinsicInfo::IsPermuteVar2x(intrinsicId)) { + assert(comp->canUseEvexEncodingDebugOnly()); + // PermuteVar2x is similarly special in that op1 and op3 // are commutative and op1 or op2 can be the RMW operand. 
// @@ -10493,8 +10625,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp3RegOptional = false; - bool swapOperands = false; - bool isOp2Cns = op2->IsCnsVec(); + + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + bool isOp2Cns = op2->IsCnsVec(); LIR::Use use; GenTree* user = nullptr; @@ -10512,44 +10647,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (((resultOpNum != 3) || !isOp2Cns) && IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) { - MakeSrcContained(node, op3); + containedOperand = op3; } else if ((resultOpNum != 2) && isOp2Cns && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); - - // Swap the operands here to make the containment checks in codegen significantly simpler - swapOperands = true; + containedOperand = op1; + swapOperands = true; } - else if (supportsOp1RegOptional) + else { + if (supportsOp1RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); + } + if (supportsOp3RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op3); - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - swapOperands = true; - } + if (regOptionalOperand == op1) + { + swapOperands = true; + } + } + + if (containedOperand != nullptr) + { + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - swapOperands = true; + MakeSrcContained(node, containedOperand); } } - else if (supportsOp3RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op3); + MakeSrcRegOptional(node, regOptionalOperand); } if (swapOperands) { + // Swap the operands here to make the containment checks in codegen significantly simpler assert(op2->IsCnsVec()); std::swap(node->Op(1), node->Op(3)); @@ -10779,41 +10921,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp2RegOptional = false; + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - MakeSrcContained(node, op2); + containedOperand = op2; } else if (IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + containedOperand = op1; + swapOperands = true; } - else if (supportsOp1RegOptional) + else { - if (supportsOp2RegOptional) + if (supportsOp1RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - MakeSrcRegOptional(node, regOptionalOperand); - - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); - } + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else + + if (supportsOp2RegOptional) { - MakeSrcRegOptional(node, op1); + regOptionalOperand = 
PreferredRegOptionalOperand(regOptionalOperand, op2); + } - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + if (regOptionalOperand == op1) + { + swapOperands = true; } } - else if (supportsOp2RegOptional) + + if (containedOperand != nullptr) { - MakeSrcRegOptional(node, op2); + MakeSrcContained(node, containedOperand); + } + else if (regOptionalOperand != nullptr) + { + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands) + { + // Swap the operands here to make the containment checks in codegen significantly + // simpler + std::swap(node->Op(1), node->Op(2)); } break; } @@ -11071,45 +11223,239 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512F_VL_TernaryLogic: case NI_AVX10v1_TernaryLogic: { + assert(comp->canUseEvexEncodingDebugOnly()); + + // These are the control bytes used for TernaryLogic + + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; + if (!isContainedImm) { // Don't contain if we're generating a jmp table fallback break; } - if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional)) - { - MakeSrcContained(node, op3); - } - else if (supportsRegOptional) - { - MakeSrcRegOptional(node, op3); - } - uint8_t control = static_cast(op4->AsIntCon()->gtIconVal); - const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); - TernaryLogicUseFlags useFlags = info.GetAllUseFlags(); + const TernaryLogicInfo* info = &TernaryLogicInfo::lookup(control); + TernaryLogicUseFlags useFlags = info->GetAllUseFlags(); - if (useFlags != TernaryLogicUseFlags::ABC) - { - assert(!node->isRMWHWIntrinsic(comp)); + bool supportsOp1RegOptional = false; + bool supportsOp2RegOptional = false; + bool supportsOp3RegOptional = false; - // op1, and possibly op2, are never selected - // by the table so we can contain and ignore - // any register allocated to it resulting in - // better non-RMW based codegen. 
+ GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + TernaryLogicUseFlags swapOperands = TernaryLogicUseFlags::None; - MakeSrcContained(node, op1); + switch (useFlags) + { + case TernaryLogicUseFlags::None: + { + break; + } - if (useFlags == TernaryLogicUseFlags::C) + case TernaryLogicUseFlags::C: { + // We're only using op3, so that's the one to try and contain + + assert(op1->IsCnsVec()); + MakeSrcContained(node, op1); + + assert(op2->IsCnsVec()); MakeSrcContained(node, op2); + + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (supportsOp3RegOptional) + { + regOptionalOperand = op3; + } + break; + } + + case TernaryLogicUseFlags::BC: + { + // We're only using op2 and op3, so find the right one to contain + // using the standard commutative rules, fixing up the control byte + // as needed to ensure the operation remains the same + + assert(op1->IsCnsVec()); + MakeSrcContained(node, op1); + + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) + { + containedOperand = op2; + swapOperands = TernaryLogicUseFlags::BC; + } + else + { + if (supportsOp2RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); + } + + if (supportsOp3RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } + + if (regOptionalOperand == op2) + { + swapOperands = TernaryLogicUseFlags::BC; + } + } + break; + } + + case TernaryLogicUseFlags::ABC: + { + // TernaryLogic is special in that any operand can be contained + // and any other operand can be the RMW operand. + // + // This comes about from having a control byte that indicates + // the operation to be performed per operand. + + LIR::Use use; + GenTree* user = nullptr; + + if (BlockRange().TryGetUse(node, &use)) + { + user = use.User(); + } + unsigned resultOpNum = node->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3); + + // Prioritize Containable op. Check if any one of the op is containable first. + // Set op regOptional only if none of them is containable. 
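For the resultOpNum fixups just below: when the RMW operand is swapped into position 1, the control byte must be re-derived with the corresponding columns exchanged so the node still computes the same function; that appears to be the role of the GetTernaryControlByte calls in this hunk. A hedged, self-contained illustration (names here are for exposition only, not JIT APIs):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;

        // Suppose the node computes op1 & ~op2 (op3 is not selected by the table).
        uint8_t control = static_cast<uint8_t>(A & ~B); // 0x30

        // After std::swap(node->Op(1), node->Op(2)), the old op1 value lives in
        // position 2, so the byte is re-evaluated with the A and B columns
        // exchanged to keep the overall result identical.
        uint8_t swapped = static_cast<uint8_t>(B & ~A); // 0x0C

        std::printf("0x%02X -> 0x%02X\n", static_cast<unsigned>(control), static_cast<unsigned>(swapped));
        return 0;
    }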
+ + if (resultOpNum == 2) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(2)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, B, A, C); + op4->AsIntCon()->SetIconValue(control); + + // Result is now in op1, but also get the updated info + resultOpNum = 1; + info = &TernaryLogicInfo::lookup(control); + } + else if (resultOpNum == 3) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + + // Result is now in op1, but also get the updated info + resultOpNum = 1; + info = &TernaryLogicInfo::lookup(control); + } + + // Prefer to make op3 contained as it doesn't require reordering operands + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) + { + containedOperand = op2; + swapOperands = TernaryLogicUseFlags::BC; + } + else if ((resultOpNum != 1) && + IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) + { + containedOperand = op1; + swapOperands = TernaryLogicUseFlags::AC; + } + else + { + if (supportsOp1RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); + } + + if (supportsOp2RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); + } + + if (supportsOp3RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } + + if (regOptionalOperand == op1) + { + swapOperands = TernaryLogicUseFlags::AC; + } + else if (regOptionalOperand == op2) + { + swapOperands = TernaryLogicUseFlags::BC; + } + } + break; + } + + default: + { + // Lowering should have normalized to one of the above + unreached(); + } + } + + if (containedOperand != nullptr) + { + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - assert(useFlags == TernaryLogicUseFlags::BC); + MakeSrcContained(node, containedOperand); } } + else if (regOptionalOperand != nullptr) + { + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands == TernaryLogicUseFlags::AC) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + } + else if (swapOperands == TernaryLogicUseFlags::BC) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(2), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, A, C, B); + op4->AsIntCon()->SetIconValue(control); + } + else + { + assert(swapOperands == TernaryLogicUseFlags::None); + } break; } From bd9f4f68cc0144309edd3f37d8210862b148cbc6 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 8 Jul 2024 13:21:42 -0700 Subject: [PATCH 08/10] Ensure we swap the operands that are being checked for containment --- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 17 ++++++++++++++--- 
src/coreclr/jit/lowerxarch.cpp | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index bb3876dd085e25..ed26f3d7490b09 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1298,9 +1298,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I( // non-RMW based codegen. #if defined(DEBUG) - NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); - assert((intrinsicId == NI_AVX512F_TernaryLogic) || (intrinsicId == NI_AVX512F_VL_TernaryLogic) || - (intrinsicId == NI_AVX10v1_TernaryLogic)); + assert(HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId())); uint8_t control = static_cast(ival); const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); @@ -1311,6 +1309,19 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I( op2Reg = targetReg; } + else + { +#if defined(DEBUG) + if (HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId())) + { + uint8_t control = static_cast(ival); + const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); + TernaryLogicUseFlags useFlags = info.GetAllUseFlags(); + + assert(useFlags == TernaryLogicUseFlags::BC); + } +#endif // DEBUG + } } assert(targetReg != REG_NA); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 02e52f00dfc4a6..2bb1870c4b13b8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -11340,6 +11340,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // Swap the operands here to make the containment checks in codegen // significantly simpler std::swap(node->Op(1), node->Op(2)); + std::swap(op1, op2); // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(*info, B, A, C); @@ -11354,6 +11355,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // Swap the operands here to make the containment checks in codegen // significantly simpler std::swap(node->Op(1), node->Op(3)); + std::swap(op1, op3); // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(*info, C, B, A); From c4b0c3a91d195fbc2b94796c6ebffa0f48dec23f Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 8 Jul 2024 15:48:41 -0700 Subject: [PATCH 09/10] Ensure that TernaryLogic is simplified where possible --- src/coreclr/jit/hwintrinsic.cpp | 6 +- src/coreclr/jit/lowerxarch.cpp | 141 ++++++++++++++++++++++++-------- 2 files changed, 110 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 451074ab3234fb..c9f84c3507e6a4 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -649,19 +649,19 @@ uint8_t TernaryLogicInfo::GetTernaryControlByte(const TernaryLogicInfo& info, ui case TernaryLogicUseFlags::AB: { - oper2Result = GetTernaryControlByte(info.oper1, op1, op2); + oper2Result = GetTernaryControlByte(info.oper2, op1, op2); break; } case TernaryLogicUseFlags::AC: { - oper2Result = GetTernaryControlByte(info.oper1, op1, op3); + oper2Result = GetTernaryControlByte(info.oper2, op1, op3); break; } case TernaryLogicUseFlags::BC: { - oper2Result = GetTernaryControlByte(info.oper1, op2, op3); + oper2Result = GetTernaryControlByte(info.oper2, op2, op3); break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2bb1870c4b13b8..c68c14c7559f56 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3645,56 +3645,32 
@@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) { case TernaryLogicUseFlags::A: { - // Mark the other operands as unused and replace them with zero constants - op3->SetUnusedValue(); - op3 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op3); - node->Op(3) = op3; - - op2->SetUnusedValue(); - op2 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op2); - node->Op(2) = op2; - // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(1), node->Op(3)); // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(info, C, B, A); op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::C; break; } case TernaryLogicUseFlags::B: { - // Mark the other operands as unused and replace them with zero constants - op3->SetUnusedValue(); - op3 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op3); - node->Op(3) = op3; - - op1->SetUnusedValue(); - op1 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op1); - node->Op(1) = op1; - // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(2), node->Op(3)); // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(info, A, C, B); op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::C; break; } case TernaryLogicUseFlags::AB: { - // Mark the other operands as unused and replace them with zero constants - op3->SetUnusedValue(); - op3 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op3); - node->Op(3) = op3; - // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(2), node->Op(3)); std::swap(node->Op(1), node->Op(2)); @@ -3702,31 +3678,128 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(info, C, A, B); op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::BC; break; } case TernaryLogicUseFlags::AC: { - // Mark the other operands as unused and replace them with zero constants - op2->SetUnusedValue(); - op2 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op2); - node->Op(2) = op2; - // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(1), node->Op(2)); // Make sure we also fixup the control byte control = TernaryLogicInfo::GetTernaryControlByte(info, B, A, C); op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::BC; + break; + } + + default: + { + break; + } + } + + GenTree* replacementNode = nullptr; + + switch (useFlags) + { + case TernaryLogicUseFlags::None: + { + // Encountering none should be very rare and so we'll handle + // it, but we won't try to optimize it by finding an existing + // constant to reuse or similar, as that's more expensive + + op1->SetUnusedValue(); + op2->SetUnusedValue(); + op3->SetUnusedValue(); + + if (control == 0x00) + { + replacementNode = comp->gtNewZeroConNode(simdType); + } + else + { + assert(control == 0xFF); + replacementNode = comp->gtNewAllBitsSetConNode(simdType); + } + + BlockRange().InsertBefore(node, replacementNode); + break; + } + + case TernaryLogicUseFlags::C: + { + // Encountering `select(c)` instead of `not(c)` should likewise + // be rare, but we'll handle it in case the 
combined operations + // are just right to cause it to appear + + if (control == C) + { + op1->SetUnusedValue(); + op2->SetUnusedValue(); + + replacementNode = op3; + break; + } + + // For not, we do want to check if we already have reusable constants as + // this can occur for the normal lowering pattern around `xor(c, AllBitsSet)` + + if (!op1->IsCnsVec()) + { + op1->SetUnusedValue(); + op1 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + } + + if (!op2->IsCnsVec()) + { + op2->SetUnusedValue(); + op2 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op2); + node->Op(2) = op2; + } + break; + } + + case TernaryLogicUseFlags::BC: + { + if (!op1->IsCnsVec()) + { + op1->SetUnusedValue(); + op1 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + } break; } default: { + assert(useFlags == TernaryLogicUseFlags::ABC); break; } } + + if (replacementNode != nullptr) + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(replacementNode); + } + else + { + replacementNode->SetUnusedValue(); + } + } break; } } From 5dfc7ed5c24aecc44b400978766012a07f638825 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 8 Jul 2024 17:09:10 -0700 Subject: [PATCH 10/10] Apply formatting patch --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c68c14c7559f56..5e47f774bdeaec 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3708,7 +3708,7 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) { case TernaryLogicUseFlags::None: { - // Encountering none should be very rare and so we'll handle + // Encountering none should be very rare and so we'll handle // it, but we won't try to optimize it by finding an existing // constant to reuse or similar, as that's more expensive
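Taken together, the LowerHWIntrinsicTernaryLogic change above reduces the degenerate cases to a handful of control bytes once unused operands have been normalized toward op3. A rough sketch of that mapping, using hypothetical names rather than the JIT's actual types:

    #include <cstdint>

    enum class TernaryKind { Zero, AllBitsSet, SelectC, NotC, Other };

    // Hypothetical classifier mirroring the degenerate cases handled above:
    // 0x00/0xFF arise from TernaryLogicUseFlags::None, while 0xAA/0x55 are the
    // only bytes possible once the operation depends solely on op3.
    TernaryKind Classify(uint8_t control)
    {
        const uint8_t C = 0xAA; // truth-table column for op3

        if (control == 0x00) return TernaryKind::Zero;        // node replaced by a zero constant
        if (control == 0xFF) return TernaryKind::AllBitsSet;  // node replaced by an all-bits-set constant
        if (control == C)    return TernaryKind::SelectC;     // node replaced by op3 itself
        if (control == static_cast<uint8_t>(~C))
        {
            return TernaryKind::NotC;                         // op1/op2 only need to be some constant
        }
        return TernaryKind::Other;
    }

The 0x55 (not) case is why the lowering checks for already-constant op1/op2 before materializing fresh zero constants: the AllBitsSet vector left over from the usual xor(c, AllBitsSet) pattern can simply be reused.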