diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index e04b137aeb3849..3d2748659ef185 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19677,6 +19677,45 @@ GenTree* Compiler::gtNewSimdBinOpNode( intrinsic = NI_AVX2_ShiftRightLogical; } } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + + if (op == GT_LSH) + { + if (varTypeIsShort(simdBaseType)) + { + intrinsic = NI_AVX512BW_ShiftLeftLogical; + } + else + { + intrinsic = NI_AVX512F_ShiftLeftLogical; + } + } + else if (op == GT_RSH) + { + if (varTypeIsShort(simdBaseType)) + { + intrinsic = NI_AVX512BW_ShiftRightArithmetic; + } + else + { + intrinsic = NI_AVX512F_ShiftRightArithmetic; + } + } + else + { + assert(op == GT_RSZ); + if (varTypeIsShort(simdBaseType)) + { + intrinsic = NI_AVX512BW_ShiftRightLogical; + } + else + { + intrinsic = NI_AVX512F_ShiftRightLogical; + } + } + } else if (op == GT_LSH) { intrinsic = NI_SSE2_ShiftLeftLogical; @@ -20350,11 +20389,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode( NamedIntrinsic intrinsic = NI_Illegal; - if (simdSize == 64) - { - assert(op == GT_EQ); - } - switch (op) { #if defined(TARGET_XARCH) @@ -20429,6 +20463,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX_CompareGreaterThanOrEqual; } } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_AVX512F_CompareGreaterThanOrEqualSpecial; + } else if (simdBaseType == TYP_FLOAT) { intrinsic = NI_SSE_CompareGreaterThanOrEqual; @@ -20575,6 +20614,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX2_CompareGreaterThan; } } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_AVX512F_CompareGreaterThanSpecial; + } else if (simdBaseType == TYP_FLOAT) { intrinsic = NI_SSE_CompareGreaterThan; @@ -20655,6 +20699,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX_CompareLessThanOrEqual; } } + else if (simdSize == 64) 
+ { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_AVX512F_CompareLessThanOrEqualSpecial; + } else if (simdBaseType == TYP_FLOAT) { intrinsic = NI_SSE_CompareLessThanOrEqual; @@ -20801,6 +20850,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX2_CompareLessThan; } } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_AVX512F_CompareLessThanSpecial; + } else if (simdBaseType == TYP_FLOAT) { intrinsic = NI_SSE_CompareLessThan; @@ -20989,18 +21043,18 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( #if defined(TARGET_XARCH) case GT_EQ: { - if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_Vector512_op_Equality; - } - else if (simdSize == 32) + if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); intrinsic = NI_Vector256_op_Equality; } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_Vector512_op_Equality; + } else { intrinsic = NI_Vector128_op_Equality; @@ -21009,8 +21063,125 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( } case GT_GE: + { + // We want to generate a comparison along the lines of + // GT_XX(op1, op2).As() == Vector128.AllBitsSet + + if (simdSize == 32) + { + // TODO-XArch-CQ: It's a non-trivial amount of work to support these + // for floating-point while only utilizing AVX. It would require, among + // other things, inverting the comparison and potentially support for a + // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. 
+ assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + intrinsic = NI_Vector256_op_Equality; + } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_Vector512_GreaterThanOrEqualAll; + break; + } + else + { + intrinsic = NI_Vector128_op_Equality; + } + + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); + op2 = gtNewAllBitsSetConNode(simdType); + + if (simdBaseType == TYP_FLOAT) + { + simdBaseType = TYP_INT; + simdBaseJitType = CORINFO_TYPE_INT; + } + else if (simdBaseType == TYP_DOUBLE) + { + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; + } + break; + } case GT_GT: + { + // We want to generate a comparison along the lines of + // GT_XX(op1, op2).As() == Vector128.AllBitsSet + + if (simdSize == 32) + { + // TODO-XArch-CQ: It's a non-trivial amount of work to support these + // for floating-point while only utilizing AVX. It would require, among + // other things, inverting the comparison and potentially support for a + // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. 
+ assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + intrinsic = NI_Vector256_op_Equality; + } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_Vector512_GreaterThanAll; + break; + } + else + { + intrinsic = NI_Vector128_op_Equality; + } + + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); + op2 = gtNewAllBitsSetConNode(simdType); + + if (simdBaseType == TYP_FLOAT) + { + simdBaseType = TYP_INT; + simdBaseJitType = CORINFO_TYPE_INT; + } + else if (simdBaseType == TYP_DOUBLE) + { + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; + } + break; + } case GT_LE: + { + // We want to generate a comparison along the lines of + // GT_XX(op1, op2).As() == Vector128.AllBitsSet + + if (simdSize == 32) + { + // TODO-XArch-CQ: It's a non-trivial amount of work to support these + // for floating-point while only utilizing AVX. It would require, among + // other things, inverting the comparison and potentially support for a + // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. 
+ assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + intrinsic = NI_Vector256_op_Equality; + } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_Vector512_LessThanOrEqualAll; + break; + } + else + { + intrinsic = NI_Vector128_op_Equality; + } + + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); + op2 = gtNewAllBitsSetConNode(simdType); + + if (simdBaseType == TYP_FLOAT) + { + simdBaseType = TYP_INT; + simdBaseJitType = CORINFO_TYPE_INT; + } + else if (simdBaseType == TYP_DOUBLE) + { + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; + } + break; + } case GT_LT: { // We want to generate a comparison along the lines of @@ -21025,6 +21196,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); intrinsic = NI_Vector256_op_Equality; } + else if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + intrinsic = NI_Vector512_LessThanAll; + break; + } else { intrinsic = NI_Vector128_op_Equality; @@ -21257,7 +21434,18 @@ GenTree* Compiler::gtNewSimdCndSelNode( #if defined(TARGET_XARCH) assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX)); - intrinsic = (simdSize == 32) ? 
NI_Vector256_ConditionalSelect : NI_Vector128_ConditionalSelect; + if (simdSize == 32) + { + intrinsic = NI_Vector256_ConditionalSelect; + } + else if (simdSize == 64) + { + intrinsic = NI_Vector512_ConditionalSelect; + } + else + { + intrinsic = NI_Vector128_ConditionalSelect; + } return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); #elif defined(TARGET_ARM64) return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, NI_AdvSimd_BitwiseSelect, simdBaseJitType, simdSize); diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 772b8439ba65d5..a4c9fbd83eda25 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1760,6 +1760,64 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_CompareGreaterThanOrEqualSpecial: + { + GenTree* op2 = node->Op(2); + op1Reg = op1->GetRegNum(); + regNumber op2Reg = op2->GetRegNum(); + + instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanOrEqualSpecial, baseType); + + assert(compareIns != INS_invalid); + assert(emitter::isMaskReg(targetReg)); + + emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 5); + break; + } + + case NI_AVX512F_CompareGreaterThanSpecial: + { + GenTree* op2 = node->Op(2); + op1Reg = op1->GetRegNum(); + regNumber op2Reg = op2->GetRegNum(); + + instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanSpecial, baseType); + + assert(compareIns != INS_invalid); + assert(emitter::isMaskReg(targetReg)); + + emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 6); + break; + } + case NI_AVX512F_CompareLessThanOrEqualSpecial: + { + GenTree* op2 = node->Op(2); + op1Reg = op1->GetRegNum(); + regNumber op2Reg = op2->GetRegNum(); + + instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanOrEqualSpecial, baseType); + + assert(compareIns != 
INS_invalid); + assert(emitter::isMaskReg(targetReg)); + + emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 2); + break; + } + + case NI_AVX512F_CompareLessThanSpecial: + { + GenTree* op2 = node->Op(2); + op1Reg = op1->GetRegNum(); + regNumber op2Reg = op2->GetRegNum(); + + instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanSpecial, baseType); + + assert(compareIns != INS_invalid); + assert(emitter::isMaskReg(targetReg)); + + emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 1); + break; + } case NI_AVX512F_MoveMaskToVectorSpecial: { op1Reg = op1->GetRegNum(); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index cb7039116612fa..3c9ac8f2d1ddb8 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -259,6 +259,7 @@ HARDWARE_INTRINSIC(Vector512, AsUInt64, HARDWARE_INTRINSIC(Vector512, AsVector, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, AsVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, BitwiseAnd, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, ConditionalSelect, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, BitwiseOr, 64, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -277,6 +278,18 @@ HARDWARE_INTRINSIC(Vector512, GetElement, HARDWARE_INTRINSIC(Vector512, GetLower, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector512, GetLower128, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector512, GetUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, GreaterThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, GreaterThanAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, GreaterThanAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, GreaterThanOrEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, GreaterThanOrEqualAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, GreaterThanOrEqualAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LessThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, LessThanAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LessThanAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LessThanOrEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, LessThanOrEqualAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LessThanOrEqualAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -291,9 +304,16 @@ HARDWARE_INTRINSIC(Vector512, op_Division, HARDWARE_INTRINSIC(Vector512, op_Equality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(Vector512, 
op_ExclusiveOr, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, op_Inequality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) +HARDWARE_INTRINSIC(Vector512, op_LeftShift, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, op_Multiply, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, op_OnesComplement, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, op_RightShift, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, op_Subtraction, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, op_UnsignedRightShift, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512,
ShiftLeft, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, ShiftRightArithmetic, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, ShiftRightLogical, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, Shuffle, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Store, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, StoreAligned, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -1052,6 +1072,10 @@ HARDWARE_INTRINSIC(AVX, PTEST, HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 245d9bcc98eb58..5f80754bb55ea4 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -977,6 +977,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: + case NI_Vector512_ConditionalSelect: { assert(sig->numArgs == 3); @@ -1388,25 +1389,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - case NI_Vector512_EqualsAll: - case NI_Vector512_op_Equality: - { - assert(sig->numArgs == 2); - assert(IsBaselineVector512IsaSupportedDebugOnly()); - - var_types simdType = getSIMDTypeForSize(simdSize); - - op2 = impSIMDPopStack(); - op1 = impSIMDPopStack(); - - retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize); - break; - } - case NI_Vector128_EqualsAll: case NI_Vector256_EqualsAll: + case NI_Vector512_EqualsAll: case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: + case NI_Vector512_op_Equality: { assert(sig->numArgs == 2); @@ -1422,23 +1410,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - case NI_Vector512_EqualsAny: - { - assert(sig->numArgs == 2); - assert(simdSize == 64); - assert(IsBaselineVector512IsaSupportedDebugOnly()); - - var_types simdType = getSIMDTypeForSize(simdSize); - - op2 = impSIMDPopStack(); - op1 = impSIMDPopStack(); - - retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize); - break; - } - case NI_Vector128_EqualsAny: case NI_Vector256_EqualsAny: + case NI_Vector512_EqualsAny: { assert(sig->numArgs == 2); @@ -1677,6 +1651,7 @@ GenTree* 
Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThan: case NI_Vector256_GreaterThan: + case NI_Vector512_GreaterThan: { assert(sig->numArgs == 2); @@ -1692,6 +1667,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThanAll: case NI_Vector256_GreaterThanAll: + case NI_Vector512_GreaterThanAll: { assert(sig->numArgs == 2); @@ -1709,6 +1685,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThanAny: case NI_Vector256_GreaterThanAny: + case NI_Vector512_GreaterThanAny: { assert(sig->numArgs == 2); @@ -1726,6 +1703,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThanOrEqual: case NI_Vector256_GreaterThanOrEqual: + case NI_Vector512_GreaterThanOrEqual: { assert(sig->numArgs == 2); @@ -1741,6 +1719,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThanOrEqualAll: case NI_Vector256_GreaterThanOrEqualAll: + case NI_Vector512_GreaterThanOrEqualAll: { assert(sig->numArgs == 2); @@ -1758,6 +1737,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GreaterThanOrEqualAny: case NI_Vector256_GreaterThanOrEqualAny: + case NI_Vector512_GreaterThanOrEqualAny: { assert(sig->numArgs == 2); @@ -1775,6 +1755,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThan: case NI_Vector256_LessThan: + case NI_Vector512_LessThan: { assert(sig->numArgs == 2); @@ -1790,6 +1771,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThanAll: case NI_Vector256_LessThanAll: + case NI_Vector512_LessThanAll: { assert(sig->numArgs == 2); @@ -1807,6 +1789,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThanAny: case NI_Vector256_LessThanAny: + case NI_Vector512_LessThanAny: { assert(sig->numArgs == 2); @@ -1824,6 +1807,7 @@ GenTree* 
Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThanOrEqual: case NI_Vector256_LessThanOrEqual: + case NI_Vector512_LessThanOrEqual: { assert(sig->numArgs == 2); @@ -1839,6 +1823,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThanOrEqualAll: case NI_Vector256_LessThanOrEqualAll: + case NI_Vector512_LessThanOrEqualAll: { assert(sig->numArgs == 2); @@ -1856,6 +1841,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LessThanOrEqualAny: case NI_Vector256_LessThanOrEqualAny: + case NI_Vector512_LessThanOrEqualAny: { assert(sig->numArgs == 2); @@ -2139,8 +2125,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ShiftLeft: case NI_Vector256_ShiftLeft: + case NI_Vector512_ShiftLeft: case NI_Vector128_op_LeftShift: case NI_Vector256_op_LeftShift: + case NI_Vector512_op_LeftShift: { assert(sig->numArgs == 2); @@ -2162,8 +2150,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ShiftRightArithmetic: case NI_Vector256_ShiftRightArithmetic: + case NI_Vector512_ShiftRightArithmetic: case NI_Vector128_op_RightShift: case NI_Vector256_op_RightShift: + case NI_Vector512_op_RightShift: { assert(sig->numArgs == 2); @@ -2196,8 +2186,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ShiftRightLogical: case NI_Vector256_ShiftRightLogical: + case NI_Vector512_ShiftRightLogical: case NI_Vector128_op_UnsignedRightShift: case NI_Vector256_op_UnsignedRightShift: + case NI_Vector512_op_UnsignedRightShift: { assert(sig->numArgs == 2); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 8b7cae89422bbb..16677ef3304e92 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -353,6 +353,7 @@ class Lowering final : public Phase GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic 
newIntrinsicId, GenCondition condition); GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp); + GenTree* LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4525a8a81bf172..f2ab01def6d9e5 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1041,6 +1041,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) { case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: + case NI_Vector512_ConditionalSelect: { return LowerHWIntrinsicCndSel(node); } @@ -1188,18 +1189,30 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: - case NI_Vector512_op_Equality: { return LowerHWIntrinsicCmpOp(node, GT_EQ); } case NI_Vector128_op_Inequality: case NI_Vector256_op_Inequality: - case NI_Vector512_op_Inequality: { return LowerHWIntrinsicCmpOp(node, GT_NE); } + case NI_Vector512_GreaterThanAll: + case NI_Vector512_GreaterThanAny: + case NI_Vector512_GreaterThanOrEqualAll: + case NI_Vector512_GreaterThanOrEqualAny: + case NI_Vector512_LessThanAll: + case NI_Vector512_LessThanAny: + case NI_Vector512_LessThanOrEqualAll: + case NI_Vector512_LessThanOrEqualAny: + case NI_Vector512_op_Equality: + case NI_Vector512_op_Inequality: + { + return LowerHWIntrinsicCmpOpWithKReg(node); + } + case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: @@ -1649,8 +1662,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm var_types simdType = Compiler::getSIMDTypeForSize(simdSize); assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) || - (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == 
NI_Vector256_op_Inequality) || - (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality)); + (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality)); assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); @@ -1663,23 +1675,8 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // /--* op1 simd // node = * HWINTRINSIC simd T op_Equality - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); - - if (simdSize == 64) - { - GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareEqualSpecial, - simdBaseJitType, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); - - node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp); - GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC; - LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd); - - return node->gtNext; - } - + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE; if (!varTypeIsFloating(simdBaseType) && op2->IsVectorZero() && @@ -1882,6 +1879,80 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm return LowerNode(node); } +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCmpOpWithKReg: Lowers a Vector512 comparison intrinsic +// +// Arguments: +// node - The hardware intrinsic node. 
+// +GenTree* Lowering::LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert((intrinsicId == NI_Vector512_GreaterThanAll) || (intrinsicId == NI_Vector512_GreaterThanOrEqualAll) || + (intrinsicId == NI_Vector512_LessThanAll) || (intrinsicId == NI_Vector512_LessThanOrEqualAll) || + (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality)); + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(simdBaseType)); + assert(simdSize == 64); + assert(node->gtType == TYP_BOOL); + + NamedIntrinsic newIntrinsicId = NI_Illegal; + switch (intrinsicId) + { + case NI_Vector512_GreaterThanAll: + { + newIntrinsicId = NI_AVX512F_CompareGreaterThanSpecial; + break; + } + case NI_Vector512_GreaterThanOrEqualAll: + { + newIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualSpecial; + break; + } + case NI_Vector512_LessThanAll: + { + newIntrinsicId = NI_AVX512F_CompareLessThanSpecial; + break; + } + case NI_Vector512_LessThanOrEqualAll: + { + newIntrinsicId = NI_AVX512F_CompareLessThanOrEqualSpecial; + break; + } + case NI_Vector512_op_Equality: + case NI_Vector512_op_Inequality: + { + newIntrinsicId = NI_AVX512F_CompareEqualSpecial; + break; + } + + default: + { + assert(false); + break; + } + } + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, newIntrinsicId, simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); + + node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp); + GenCondition cmpCnd = (intrinsicId != NI_Vector512_op_Inequality) ? 
GenCondition::C : GenCondition::NC; + LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd); + + return node->gtNext; +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicCndSel: Lowers a Vector128 or Vector256 Conditional Select call // @@ -1912,6 +1983,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) // we can optimize the entire conditional select to // a single BlendVariable instruction (if supported by the architecture) + // TODO-XARCH-AVX512 Use VPBLENDM* and take input directly from K registers if cond is from MoveMaskToVectorSpecial. // First, determine if the condition is a per-element mask if (op1->OperIsHWIntrinsic() && HWIntrinsicInfo::ReturnsPerElementMask(op1->AsHWIntrinsic()->GetHWIntrinsicId())) {