From eafe8187aaef010016a6b76fccc0b2db4f678e2f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 7 Dec 2023 01:43:07 +0000 Subject: [PATCH] Add Arm64 encodings for IF_SVE_EQ_3A to IF_SVE_HR_3A (#95679) --- src/coreclr/jit/codegenarm64.cpp | 72 +++++++++++ src/coreclr/jit/emit.h | 4 +- src/coreclr/jit/emitarm64.cpp | 198 ++++++++++++++++++++++++++++++- 3 files changed, 271 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 0ffcecddb0214..6ac2377ac4663 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -10513,6 +10513,30 @@ void CodeGen::genArm64EmitterUnitTests() INS_OPTS_SCALABLE_D); /* REVH ., /M, . */ theEmitter->emitIns_R_R_R(INS_sve_revw, EA_SCALABLE, REG_V25, REG_P4, REG_V16, INS_OPTS_SCALABLE_D); /* REVW .D, /M, .D */ + + // IF_SVE_EQ_3A + // Note: Scalable size is the size of the destination , not the source . + theEmitter->emitIns_R_R_R(INS_sve_sadalp, EA_SCALABLE, REG_V26, REG_P3, REG_V8, + INS_OPTS_SCALABLE_H); /* SADALP ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_uadalp, EA_SCALABLE, REG_V27, REG_P2, REG_V9, + INS_OPTS_SCALABLE_S); /* UADALP ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_uadalp, EA_SCALABLE, REG_V28, REG_P0, REG_V31, + INS_OPTS_SCALABLE_D); /* UADALP ., /M, . */ + + // IF_SVE_ES_3A + theEmitter->emitIns_R_R_R(INS_sve_sqabs, EA_SCALABLE, REG_V29, REG_P7, REG_V0, + INS_OPTS_SCALABLE_B); /* SQABS ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_sqneg, EA_SCALABLE, REG_V31, REG_P6, REG_V1, + INS_OPTS_SCALABLE_H); /* SQNEG ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_sqneg, EA_SCALABLE, REG_V0, REG_P5, REG_V2, + INS_OPTS_SCALABLE_S); /* SQNEG ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_sqneg, EA_SCALABLE, REG_V1, REG_P4, REG_V3, + INS_OPTS_SCALABLE_D); /* SQNEG ., /M, . 
*/ + theEmitter->emitIns_R_R_R(INS_sve_urecpe, EA_SCALABLE, REG_V2, REG_P3, REG_V4, + INS_OPTS_SCALABLE_S); /* URECPE .S, /M, .S */ + theEmitter->emitIns_R_R_R(INS_sve_ursqrte, EA_SCALABLE, REG_V3, REG_P0, REG_V5, + INS_OPTS_SCALABLE_S); /* URSQRTE .S, /M, .S */ + // IF_SVE_GA_2A theEmitter->emitIns_R_R_I(INS_sve_sqrshrn, EA_SCALABLE, REG_V0, REG_V0, 5, INS_OPTS_SCALABLE_H); // SQRSHRN .H, {.S-.S }, # @@ -10533,6 +10557,54 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R_I(INS_sve_uqrshrn, EA_SCALABLE, REG_V15, REG_V12, 1, INS_OPTS_SCALABLE_H); // UQRSHRN .H, {.S-.S }, # +// IF_SVE_GS_3A +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED + theEmitter->emitIns_R_R_R(INS_sve_faddqv, EA_8BYTE, REG_V16, REG_P0, REG_V12, + INS_OPTS_SCALABLE_H_WITH_SIMD_VECTOR); /* FADDQV ., , . */ + theEmitter->emitIns_R_R_R(INS_sve_fmaxnmqv, EA_8BYTE, REG_V17, REG_P1, REG_V11, + INS_OPTS_SCALABLE_S_WITH_SIMD_VECTOR); /* FMAXNMQV ., , . */ + theEmitter->emitIns_R_R_R(INS_sve_fmaxqv, EA_8BYTE, REG_V18, REG_P3, REG_V10, + INS_OPTS_SCALABLE_D_WITH_SIMD_VECTOR); /* FMAXQV ., , . */ + theEmitter->emitIns_R_R_R(INS_sve_fminnmqv, EA_8BYTE, REG_V19, REG_P4, REG_V9, + INS_OPTS_SCALABLE_H_WITH_SIMD_VECTOR); /* FMINNMQV ., , . */ + theEmitter->emitIns_R_R_R(INS_sve_fminqv, EA_8BYTE, REG_V20, REG_P5, REG_V8, + INS_OPTS_SCALABLE_D_WITH_SIMD_VECTOR); /* FMINQV ., , . */ +#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED + + // IF_SVE_HE_3A + theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_2BYTE, REG_V21, REG_P7, REG_V7, + INS_OPTS_SCALABLE_H_WITH_SIMD_SCALAR); /* FADDV , , . */ + theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_2BYTE, REG_V22, REG_P6, REG_V6, + INS_OPTS_SCALABLE_H_WITH_SIMD_SCALAR); /* FMAXNMV , , . */ + theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_4BYTE, REG_V23, REG_P5, REG_V5, + INS_OPTS_SCALABLE_S_WITH_SIMD_SCALAR); /* FMAXV , , . 
*/ + theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_8BYTE, REG_V24, REG_P4, REG_V4, + INS_OPTS_SCALABLE_D_WITH_SIMD_SCALAR); /* FMINNMV , , . */ + theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_4BYTE, REG_V25, REG_P3, REG_V3, + INS_OPTS_SCALABLE_S_WITH_SIMD_SCALAR); /* FMINV , , . */ + + // IF_SVE_HQ_3A + theEmitter->emitIns_R_R_R(INS_sve_frinta, EA_SCALABLE, REG_V26, REG_P7, REG_V2, + INS_OPTS_SCALABLE_H); /* FRINTA ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frinti, EA_SCALABLE, REG_V27, REG_P6, REG_V1, + INS_OPTS_SCALABLE_S); /* FRINTI ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frintm, EA_SCALABLE, REG_V28, REG_P5, REG_V0, + INS_OPTS_SCALABLE_D); /* FRINTM ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frintn, EA_SCALABLE, REG_V29, REG_P4, REG_V10, + INS_OPTS_SCALABLE_H); /* FRINTN ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frintp, EA_SCALABLE, REG_V30, REG_P3, REG_V11, + INS_OPTS_SCALABLE_S); /* FRINTP ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frintx, EA_SCALABLE, REG_V31, REG_P2, REG_V12, + INS_OPTS_SCALABLE_D); /* FRINTX ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_frintz, EA_SCALABLE, REG_V0, REG_P0, REG_V13, + INS_OPTS_SCALABLE_H); /* FRINTZ ., /M, . */ + + // IF_SVE_HR_3A + theEmitter->emitIns_R_R_R(INS_sve_frecpx, EA_SCALABLE, REG_V5, REG_P5, REG_V5, + INS_OPTS_SCALABLE_H); /* FRECPX ., /M, . */ + theEmitter->emitIns_R_R_R(INS_sve_fsqrt, EA_SCALABLE, REG_V6, REG_P6, REG_V6, + INS_OPTS_SCALABLE_S); /* FSQRT ., /M, . 
*/ + #endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE #ifdef ALL_ARM64_EMITTER_UNIT_TESTS diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 3e960949bbc44..45574a2ec7438 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1705,7 +1705,8 @@ class emitter #define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles #define PERFSCORE_THROUGHPUT_11C 10.0f // slower - 10 cycles #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles -#define PERFSCORE_THROUGHPUT_14C 13.0f // slower - 13 cycles +#define PERFSCORE_THROUGHPUT_14C 14.0f // slower - 14 cycles +#define PERFSCORE_THROUGHPUT_16C 16.0f // slower - 16 cycles #define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles #define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles #define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles @@ -1730,6 +1731,7 @@ class emitter #define PERFSCORE_LATENCY_11C 11.0f #define PERFSCORE_LATENCY_12C 12.0f #define PERFSCORE_LATENCY_13C 13.0f +#define PERFSCORE_LATENCY_14C 14.0f #define PERFSCORE_LATENCY_15C 15.0f #define PERFSCORE_LATENCY_16C 16.0f #define PERFSCORE_LATENCY_18C 18.0f diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index dfdd84461cc01..f53332e145987 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -985,12 +985,12 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; // Scalable to/from SIMD scalar. + case IF_SVE_AF_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise logical reduction (predicated) + case IF_SVE_AK_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer min/max reduction (predicated) case IF_SVE_CN_3A: // ........xx...... ...gggmmmmmddddd -- SVE conditionally extract element to SIMD&FP scalar case IF_SVE_CP_3A: // ........xx...... ...gggnnnnnddddd -- SVE copy SIMD&FP scalar register to vector // (predicated) case IF_SVE_CR_3A: // ........xx...... 
...gggnnnnnddddd -- SVE extract element to SIMD&FP scalar register - case IF_SVE_AF_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise logical reduction (predicated) - case IF_SVE_AK_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer min/max reduction (predicated) elemsize = id->idOpSize(); assert(insOptsScalableWithSimdScalar(id->idInsOpt())); // xx assert(isVectorRegister(id->idReg1())); // ddddd @@ -1000,6 +1000,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; // Scalable to FP SIMD scalar. + case IF_SVE_HE_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction case IF_SVE_HJ_3A: // ........xx...... ...gggmmmmmddddd -- SVE floating-point serial reduction (predicated) elemsize = id->idOpSize(); assert(insOptsScalableWithSimdFPScalar(id->idInsOpt())); // xx @@ -1023,6 +1024,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) // Scalable FP. case IF_SVE_GR_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 floating-point pairwise operations case IF_SVE_HL_3A: // ........xx...... ...gggmmmmmddddd -- SVE floating-point arithmetic (predicated) + case IF_SVE_HR_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point unary operations elemsize = id->idOpSize(); assert(insOptsScalableFloat(id->idInsOpt())); // xx assert(isVectorRegister(id->idReg1())); // ddddd @@ -1035,6 +1037,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_SVE_AG_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise logical reduction (quadwords) case IF_SVE_AJ_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (quadwords) case IF_SVE_AL_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer min/max reduction (quadwords) + case IF_SVE_GS_3A: // ........xx...... 
...gggnnnnnddddd -- SVE floating-point recursive reduction (quadwords) datasize = id->idOpSize(); assert(insOptsScalableWithSimdVector(id->idInsOpt())); // xx assert(isVectorRegister(id->idReg1())); // ddddd @@ -1117,6 +1120,37 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isValidScalarDatasize(elemsize)); break; + // Scalable, .H, .S or .D + case IF_SVE_EQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer pairwise add and accumulate long + case IF_SVE_HQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point round to integral value + elemsize = id->idOpSize(); + assert(insOptsScalableAtLeastHalf(id->idInsOpt())); // xx + assert(isVectorRegister(id->idReg1())); // ddddd + assert(isLowPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + + // Scalable, possibly fixed to .S + case IF_SVE_ES_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer unary operations (predicated) + elemsize = id->idOpSize(); + switch (id->idIns()) + { + case INS_sve_sqabs: + case INS_sve_sqneg: + assert(insOptsScalableSimple(id->idInsOpt())); + break; + + default: + assert(id->idInsOpt() == INS_OPTS_SCALABLE_S); + break; + } + assert(isVectorRegister(id->idReg1())); // ddddd + assert(isLowPredicateRegister(id->idReg2())); // ggg + assert(isVectorRegister(id->idReg3())); // mmmmm + assert(isScalableVectorSize(elemsize)); + break; + case IF_SVE_GA_2A: // ............iiii ......nnnn.ddddd -- SME2 multi-vec shift narrow elemsize = id->idOpSize(); assert(isVectorRegister(id->idReg1())); // nnnn @@ -8570,6 +8604,15 @@ void emitter::emitIns_R_R_R( fmt = IF_SVE_EP_3A; break; + case INS_sve_sadalp: + case INS_sve_uadalp: + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(insOptsScalableAtLeastHalf(opt)); + fmt = IF_SVE_EQ_3A; + break; + case INS_sve_addp: case INS_sve_smaxp: case INS_sve_sminp: @@ -8582,6 +8625,24 
@@ void emitter::emitIns_R_R_R( fmt = IF_SVE_ER_3A; break; + case INS_sve_sqabs: + case INS_sve_sqneg: + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(insOptsScalableSimple(opt)); + fmt = IF_SVE_ES_3A; + break; + + case INS_sve_urecpe: + case INS_sve_ursqrte: + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(opt == INS_OPTS_SCALABLE_S); + fmt = IF_SVE_ES_3A; + break; + case INS_sve_sqadd: case INS_sve_sqsub: case INS_sve_sqsubr: @@ -8628,6 +8689,32 @@ void emitter::emitIns_R_R_R( fmt = IF_SVE_GR_3A; break; + case INS_sve_faddqv: + case INS_sve_fmaxnmqv: + case INS_sve_fminnmqv: + case INS_sve_fmaxqv: + case INS_sve_fminqv: + unreached(); // TODO-SVE: Not yet supported. + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(insOptsScalableWithSimdVector(opt)); + fmt = IF_SVE_GS_3A; + break; + + case INS_sve_fmaxnmv: + case INS_sve_fmaxv: + case INS_sve_fminnmv: + case INS_sve_fminv: + case INS_sve_faddv: + assert(isFloatReg(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(insOptsScalableWithSimdFPScalar(opt)); + assert(isValidVectorElemsizeSveFloat(size)); + fmt = IF_SVE_HE_3A; + break; + case INS_sve_fadda: assert(isFloatReg(reg1)); assert(isLowPredicateRegister(reg2)); @@ -8667,6 +8754,29 @@ void emitter::emitIns_R_R_R( fmt = IF_SVE_HL_3A; break; + case INS_sve_frintn: + case INS_sve_frintm: + case INS_sve_frintp: + case INS_sve_frintz: + case INS_sve_frinta: + case INS_sve_frintx: + case INS_sve_frinti: + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(insOptsScalableFloat(opt)); + fmt = IF_SVE_HQ_3A; + break; + + case INS_sve_frecpx: + case INS_sve_fsqrt: + assert(isVectorRegister(reg1)); + assert(isLowPredicateRegister(reg2)); + assert(isVectorRegister(reg3)); 
+ assert(insOptsScalableFloat(opt)); + fmt = IF_SVE_HR_3A; + break; + default: unreached(); break; @@ -14307,13 +14417,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SVE_CR_3A: // ........xx...... ...gggnnnnnddddd -- SVE extract element to SIMD&FP scalar register case IF_SVE_CU_3A: // ........xx...... ...gggnnnnnddddd -- SVE reverse within elements case IF_SVE_EP_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 integer halving add/subtract (predicated) + case IF_SVE_EQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer pairwise add and accumulate long case IF_SVE_ER_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 integer pairwise arithmetic + case IF_SVE_ES_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer unary operations (predicated) case IF_SVE_ET_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 saturating add/subtract case IF_SVE_EU_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 saturating/rounding bitwise shift left // (predicated) case IF_SVE_GR_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 floating-point pairwise operations + case IF_SVE_GS_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction (quadwords) + case IF_SVE_HE_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction case IF_SVE_HJ_3A: // ........xx...... ...gggmmmmmddddd -- SVE floating-point serial reduction (predicated) case IF_SVE_HL_3A: // ........xx...... ...gggmmmmmddddd -- SVE floating-point arithmetic (predicated) + case IF_SVE_HQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point round to integral value + case IF_SVE_HR_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point unary operations code = emitInsCodeSve(ins, fmt); code |= insEncodeReg_V_4_to_0(id->idReg1()); // ddddd code |= insEncodeReg_P_12_to_10(id->idReg2()); // ggg @@ -16637,6 +16753,7 @@ void emitter::emitDispInsHelp( case IF_SVE_AK_3A: // ........xx...... 
...gggnnnnnddddd -- SVE integer min/max reduction (predicated) case IF_SVE_CR_3A: // ........xx...... ...gggnnnnnddddd -- SVE extract element to SIMD&FP scalar register case IF_SVE_CS_3A: // ........xx...... ...gggnnnnnddddd -- SVE extract element to general register + case IF_SVE_HE_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction emitDispReg(id->idReg1(), size, true); // ddddd emitDispPredicateReg(id->idReg2(), PREDICATE_NONE, true); // ggg emitDispSveReg(id->idReg3(), id->idInsOpt(), false); // mmmmm @@ -16646,6 +16763,7 @@ void emitter::emitDispInsHelp( case IF_SVE_AG_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise logical reduction (quadwords) case IF_SVE_AJ_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (quadwords) case IF_SVE_AL_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer min/max reduction (quadwords) + case IF_SVE_GS_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction (quadwords) emitDispVectorReg(id->idReg1(), id->idInsOpt(), true); // ddddd emitDispPredicateReg(id->idReg2(), PREDICATE_NONE, true); // ggg emitDispSveReg(id->idReg3(), id->idInsOpt(), false); // mmmmm @@ -16662,6 +16780,9 @@ void emitter::emitDispInsHelp( case IF_SVE_AP_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise unary operations (predicated) case IF_SVE_AQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer unary operations (predicated) case IF_SVE_CU_3A: // ........xx...... ...gggnnnnnddddd -- SVE reverse within elements + case IF_SVE_ES_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer unary operations (predicated) + case IF_SVE_HQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point round to integral value + case IF_SVE_HR_3A: // ........xx...... 
...gggnnnnnddddd -- SVE floating-point unary operations emitDispSveReg(id->idReg1(), id->idInsOpt(), true); // ddddd emitDispPredicateReg(id->idReg2(), PREDICATE_MERGE, true); // ggg emitDispSveReg(id->idReg3(), id->idInsOpt(), false); // mmmmm @@ -16689,6 +16810,13 @@ void emitter::emitDispInsHelp( emitDispReg(encodingZRtoSP(id->idReg3()), size, false); // mmmmm break; + // ., /M, . + case IF_SVE_EQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer pairwise add and accumulate long + emitDispSveReg(id->idReg1(), id->idInsOpt(), true); // ddddd + emitDispLowPredicateReg(id->idReg2(), PREDICATE_MERGE, true); // ggg + emitDispSveReg(id->idReg3(), (insOpts)((unsigned)id->idInsOpt() - 1), false); // mmmmm + break; + case IF_SVE_GA_2A: // ............iiii ......nnnn.ddddd -- SME2 multi-vec shift narrow emitDispSveReg(id->idReg1(), id->idInsOpt(), true); // ddddd emitDispSveRegList(id->idReg2(), 2, INS_OPTS_SCALABLE_S, true); // nnnn @@ -19015,6 +19143,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; + case IF_SVE_ES_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer unary operations (predicated) + switch (ins) + { + // Arithmetic, complex + case INS_sve_sqabs: + case INS_sve_sqneg: + // Cost for sqabs / sqneg + result.insLatency = PERFSCORE_LATENCY_2C; + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + break; + + // Reciprocal estimate + case INS_sve_urecpe: + case INS_sve_ursqrte: + result.insLatency = PERFSCORE_LATENCY_4C; + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + // Arithmetic, complex case IF_SVE_ET_3A: // ........xx...... 
...gggmmmmmddddd -- SVE2 saturating add/subtract result.insLatency = PERFSCORE_LATENCY_2C; @@ -19028,6 +19181,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_1C; break; + // Arithmetic, pairwise add and accum long + case IF_SVE_EQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE2 integer pairwise add and accumulate long + result.insLatency = PERFSCORE_LATENCY_4C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + // Floating point arithmetic // Floating point min/max pairwise case IF_SVE_GR_3A: // ........xx...... ...gggmmmmmddddd -- SVE2 floating-point pairwise operations @@ -19035,6 +19194,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; + // Floating point reduction, F64. (Note: Worse for F32 and F16) + case IF_SVE_HE_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction + result.insLatency = PERFSCORE_LATENCY_2C; + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + break; + // Floating point associative add, F64. (Note: Worse for F32 and F16) case IF_SVE_HJ_3A: // ........xx...... ...gggmmmmmddddd -- SVE floating-point serial reduction (predicated) result.insLatency = PERFSCORE_LATENCY_4C; @@ -19087,10 +19252,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; + // Floating point round to integral, F64. (Note: Worse for F32 and F16) + case IF_SVE_HQ_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point round to integral value + result.insLatency = PERFSCORE_LATENCY_3C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + + case IF_SVE_HR_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point unary operations + switch (ins) + { + // Floating point reciprocal estimate, F64. 
(Note: Worse for F32 and F16) + case INS_sve_frecpx: + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + // Floating point square root F64. (Note: Worse for F32 and F16) + case INS_sve_fsqrt: + result.insThroughput = PERFSCORE_THROUGHPUT_16C; + result.insLatency = PERFSCORE_LATENCY_14C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + // Not available in Arm Neoverse N2 Software Optimization Guide. case IF_SVE_AG_3A: // ........xx...... ...gggnnnnnddddd -- SVE bitwise logical reduction (quadwords) case IF_SVE_AJ_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (quadwords) case IF_SVE_AL_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer min/max reduction (quadwords) + case IF_SVE_GS_3A: // ........xx...... ...gggnnnnnddddd -- SVE floating-point recursive reduction (quadwords) result.insLatency = PERFSCORE_LATENCY_20C; // TODO-SVE: Placeholder result.insThroughput = PERFSCORE_THROUGHPUT_25C; // TODO-SVE: Placeholder break;