JIT ARM64-SVE: Add Sve.Abs() and Sve.Add() #100134

a74nh · 2024-03-22T10:24:12Z

Adds two API methods which require embedded masks (there is no mask parameter in the API call, but within the IR a truemask needs adding before calling codegen). Also adds all the embedded mask code.

dotnet-issue-labeler · 2024-03-22T10:24:17Z

Note regarding the new-api-needs-documentation label:

This serves as a reminder for when your PR is modifying a ref *.cs file and adding/modifying public APIs, please make sure the API implementation in the src *.cs file is documented with triple slash comments, so the PR reviewers can sign off that change.

dotnet-policy-service · 2024-03-22T10:24:47Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

a74nh · 2024-03-22T10:31:49Z

From Sve_Add_int.cs:

        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var result = Sve.Add(
                Unsafe.Read<Vector<Int32>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector<Int32>>(_dataTable.inArray2Ptr)
            );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }

Disassembly with DOTNET_TieredCompilation=0

note the use of ptrue and add z10.s, p7/m, z10.s, z16.s

G_M44345_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x50]!
            stp     d8, d9, [sp, #0x18]
            str     d10, [sp, #0x28]
            stp     x19, x20, [sp, #0x30]
            stp     x21, x22, [sp, #0x40]
            mov     fp, sp
            mov     x19, x0
						;; size=28 bbWeight=1 PerfScore 6.00
G_M44345_IG02:  ;; offset=0x001C
            movz    x0, #0x2CB8
            movk    x0, #0xB857 LSL #16
            movk    x0, #0xFFFF LSL #32
            movz    x1, #0x2CF8
            movk    x1, #0xB857 LSL #16
            movk    x1, #0xFFFF LSL #32
            movz    x2, #0x62F8      // code for System.String:Concat(System.String,System.String):System.String
            movk    x2, #0x6D80 LSL #16
            movk    x2, #0xFFFF LSL #32
            ldr     x2, [x2]
            blr     x2
            movz    x1, #0x5DB8      // code for System.Console:WriteLine(System.String)
            movk    x1, #0x6DD3 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            add     x20, x19, #48
            mov     x21, x20
            ldrsb   wzr, [x21]
            add     x0, x21, #32
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x1, [x21, #0x18]
            add     x0, x0, x1
            sub     x0, x0, #1
            sub     x1, x1, #1
            bic     x0, x0, x1
            ldr     q8, [x0]
            mov     x21, x20
            add     x0, x21, #40
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            mov     v9.d[0], v8.d[1]
            blr     x1
            ldr     x1, [x21, #0x18]
            ptrue   p7.s
            add     x0, x0, x1
            sub     x0, x0, #1
            sub     x1, x1, #1
            bic     x0, x0, x1
            ldr     q16, [x0]
            mov     v8.d[1], v9.d[0]
            mov     v10.16b, v8.16b
            add     z10.s, p7/m, z10.s, z16.s
            mov     x21, x20
            add     x0, x21, #48
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            mov     v8.d[0], v10.d[1]
            blr     x1
            ldr     x1, [x21, #0x18]
            add     x0, x0, x1
            sub     x0, x0, #1
            sub     x1, x1, #1
            bic     x0, x0, x1
            mov     v10.d[1], v8.d[0]
            str     q10, [x0]
            mov     x21, x20
            add     x0, x21, #32
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x1, [x21, #0x18]
            add     x0, x0, x1
            sub     x0, x0, #1
            sub     x1, x1, #1
            bic     x21, x0, x1
            mov     x22, x20
            add     x0, x22, #40
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x1, [x22, #0x18]
            add     x0, x0, x1
            sub     x0, x0, #1
            sub     x1, x1, #1
            bic     x22, x0, x1
            add     x0, x20, #48
            movz    x1, #0xE220      // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
            movk    x1, #0x6DA1 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x3, [x20, #0x18]
            add     x0, x0, x3
            sub     x0, x0, #1
            sub     x3, x3, #1
            bic     x3, x0, x3
            mov     x0, x19
            mov     x1, x21
            mov     x2, x22
            movz    x4, #0x2CF8
            movk    x4, #0xB857 LSL #16
            movk    x4, #0xFFFF LSL #32
            movz    x5, #0xF768      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int:ValidateResult(ulong,ulong,ulong,System.String):this
            movk    x5, #0x6E50 LSL #16
            movk    x5, #0xFFFF LSL #32
            ldr     x5, [x5]
            blr     x5
						;; size=440 bbWeight=1 PerfScore 109.00
G_M44345_IG03:  ;; offset=0x01D4
            ldp     x21, x22, [sp, #0x40]
            ldp     x19, x20, [sp, #0x30]
            ldr     d10, [sp, #0x28]
            ldp     d8, d9, [sp, #0x18]
            ldp     fp, lr, [sp], #0x50
            ret     lr
						;; size=24 bbWeight=1 PerfScore 7.00

a74nh · 2024-03-22T10:36:07Z

Without DOTNET_TieredCompilation set the test currently fails due to incorrect results.
I'm guessing something is going wrong saving/restoring the local variable to/from the stack

G_M44345_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x70]!
            mov     fp, sp
            str     xzr, [fp, #0x50]	// [V01 loc0]
            str     xzr, [fp, #0x58]	// [V01 loc0+0x08]
            str     x0, [fp, #0x68]	// [V00 this]
						;; size=20 bbWeight=1 PerfScore 4.50
G_M44345_IG02:  ;; offset=0x0014
            movz    x0, #0x7F20
            movk    x0, #0xB1A0 LSL #16
            movk    x0, #0xFFBE LSL #32
            movz    x1, #0xD8F0      // code for TestLibrary.TestFramework:BeginScenario(System.String)
            movk    x1, #0x5A2D LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x54D0      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray1Ptr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [x0]
            str     q16, [fp, #0x40]	// [V03 tmp1]
            ptrue   p7.s
            str     p7, [fp, #16, mul vl]	// [TEMP_02]
            ldr     q16, [fp, #0x40]	// [V03 tmp1]
            str     q16, [fp, #0x10]	// [TEMP_01]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x54E8      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray2Ptr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [x0]
            ldr     p7, [fp, #16, mul vl]	// [TEMP_02]
            ldr     q17, [fp, #0x10]	// [TEMP_01]
            mov     v18.16b, v17.16b
            add     z18.s, p7/m, z18.s, z16.s
            str     q18, [fp, #0x50]	// [V01 loc0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x5500      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_outArrayPtr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [fp, #0x50]	// [V01 loc0]
            str     q16, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x54D0      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray1Ptr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x38]	// [V04 tmp2]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x54E8      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray2Ptr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x30]	// [V05 tmp3]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0x5500      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_outArrayPtr():ulong:this
            movk    x1, #0x5A52 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x28]	// [V06 tmp4]
            ldr     x3, [fp, #0x28]	// [V06 tmp4]
            ldr     x1, [fp, #0x38]	// [V04 tmp2]
            ldr     x2, [fp, #0x30]	// [V05 tmp3]
            ldr     x0, [fp, #0x68]	// [V00 this]
            movz    x4, #0x7F20
            movk    x4, #0xB1A0 LSL #16
            movk    x4, #0xFFBE LSL #32
            movz    x5, #0x5680      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int:ValidateResult(ulong,ulong,ulong,System.String):this
            movk    x5, #0x5A52 LSL #16
            movk    x5, #0xFFFF LSL #32
            ldr     x5, [x5]
            blr     x5
						;; size=364 bbWeight=1 PerfScore 129.50
G_M44345_IG03:  ;; offset=0x0180
            ldp     fp, lr, [sp], #0x70
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

Change-Id: Ie8cfe828595da9a87adbc0857c0c44c0ce12f5b2

a74nh · 2024-03-22T10:48:32Z

@kunalspathak @tannergooding - FYI. This is almost ready (bar the incorrect result).

a74nh · 2024-03-22T13:32:36Z

Fixed the issues - the scaling was wrong in emitIns_R_S/emitIns_S_R. However, this does make me concerned we're not consistent between general purpose instructions and SVE.

This is ready for review now.

cc @dotnet/arm64-contrib

a74nh · 2024-03-22T13:36:07Z

The fixed version of RunBasicScenario_UnsafeRead without DOTNET_TieredCompilation set. All that has changed is the address TEMP_02 is stored to / loaded from.

G_M44345_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x70]!
            mov     fp, sp
            str     xzr, [fp, #0x50]	// [V01 loc0]
            str     xzr, [fp, #0x58]	// [V01 loc0+0x08]
            str     x0, [fp, #0x68]	// [V00 this]
						;; size=20 bbWeight=1 PerfScore 4.50
G_M44345_IG02:  ;; offset=0x0014
            movz    x0, #0x7F20
            movk    x0, #0x6460 LSL #16
            movk    x0, #0xFFFF LSL #32
            movz    x1, #0xD920      // code for TestLibrary.TestFramework:BeginScenario(System.String)
            movk    x1, #0x6A30 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF6D8      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray1Ptr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [x0]
            str     q16, [fp, #0x40]	// [V03 tmp1]
            ptrue   p7.s
            str     p7, [fp, #32, mul vl]	// [TEMP_02]
            ldr     q16, [fp, #0x40]	// [V03 tmp1]
            str     q16, [fp, #0x10]	// [TEMP_01]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF6F0      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray2Ptr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [x0]
            ldr     p7, [fp, #32, mul vl]	// [TEMP_02]
            ldr     q17, [fp, #0x10]	// [TEMP_01]
            mov     v18.16b, v17.16b
            add     z18.s, p7/m, z18.s, z16.s
            str     q18, [fp, #0x50]	// [V01 loc0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF708      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_outArrayPtr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            ldr     q16, [fp, #0x50]	// [V01 loc0]
            str     q16, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF6D8      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray1Ptr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x38]	// [V04 tmp2]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF6F0      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_inArray2Ptr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x30]	// [V05 tmp3]
            ldr     x0, [fp, #0x68]	// [V00 this]
            ldrsb   wzr, [x0]
            ldr     x0, [fp, #0x68]	// [V00 this]
            add     x0, x0, #48
            movz    x1, #0xF708      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int+DataTable:get_outArrayPtr():ulong:this
            movk    x1, #0x6A44 LSL #16
            movk    x1, #0xFFFF LSL #32
            ldr     x1, [x1]
            blr     x1
            str     x0, [fp, #0x28]	// [V06 tmp4]
            ldr     x3, [fp, #0x28]	// [V06 tmp4]
            ldr     x1, [fp, #0x38]	// [V04 tmp2]
            ldr     x2, [fp, #0x30]	// [V05 tmp3]
            ldr     x0, [fp, #0x68]	// [V00 this]
            movz    x4, #0x7F20
            movk    x4, #0x6460 LSL #16
            movk    x4, #0xFFFF LSL #32
            movz    x5, #0xF888      // code for JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_Add_int:ValidateResult(ulong,ulong,ulong,System.String):this
            movk    x5, #0x6A44 LSL #16
            movk    x5, #0xFFFF LSL #32
            ldr     x5, [x5]
            blr     x5
						;; size=364 bbWeight=1 PerfScore 129.50
G_M44345_IG03:  ;; offset=0x0180
            ldp     fp, lr, [sp], #0x70
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

kunalspathak

Overall looks good. I need to understand your comment about emitarm64.cpp a bit more. Will check offline.

kunalspathak · 2024-03-22T16:39:46Z

src/coreclr/jit/hwintrinsiccodegenarm64.cpp

+                if (targetReg != op2Reg)
+                {
+                    assert(targetReg != op3Reg);
+                    GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op2Reg, /* canSkip */ true);


I am not sure if it is needed here, but we should start documenting the rules of using the movprfx instruction. Do you have good documentation that explains it and where it is used? I remember seeing it long ago but I'm not sure what it was.

It's the same general mov needed for any RMW instruction, since you need the destination register and one of the input registers to be the same. The only really different thing is that movprfx is specifically the one you want for SVE instructions that have an embedded mask. -- The potentially confusing part here is that it's checking IsEmbeddedMaskedOperation when that check, as currently implemented, just means "I have a mask, but it's all true" (I gave feedback above that it should be renamed).

So I'm not sure there's really any rules to document here, we certainly don't document it at all for x86 or x64 or any of the existing Arm64 RMW cases.

This reverts commit e9fa735.

a74nh · 2024-03-25T09:02:25Z

I need to understand your comment about emitarm64.cpp a bit more

I've redone those changes. Just needed to make sure we scale by the right size (the size of the vector or predicate).

kunalspathak · 2024-03-25T17:07:23Z

So if I see https://helixre8s23ayyeko0k025g8.blob.core.windows.net/dotnet-runtime-refs-pull-100134-merge-c2bf8feefece4940bf/HardwareIntrinsics_Arm_r/1/console.7afb876d.log?helixlogtype=result , these tests seem to run on non-SVE hardware, which might be something that we don't want currently, especially because 1) we do not have SVE hardware in the lab, 2) we are testing the same thing for all the tests, which is that it throws NotSupportedException, and 3) it consumes test execution time, especially when we start adding more tests.

In long term, we should find a way to include/exclude arm64 tests based on if they are running on sve vs. non-sve hardware, something that I have asked @TIHan to investigate.

kunalspathak · 2024-03-25T17:58:23Z

So if I see https://helixre8s23ayyeko0k025g8.blob.core.windows.net/dotnet-runtime-refs-pull-100134-merge-c2bf8feefece4940bf/HardwareIntrinsics_Arm_r/1/console.7afb876d.log?helixlogtype=result , these tests seem to run on non-SVE hardware, which might be something that we don't want currently, especially because 1) we do not have SVE hardware in the lab, 2) we are testing the same thing for all the tests, which is that it throws NotSupportedException, and 3) it consumes test execution time, especially when we start adding more tests.

In long term, we should find a way to include/exclude arm64 tests based on if they are running on sve vs. non-sve hardware, something that I have asked @TIHan to investigate.

Spoke to @tannergooding and for now, we should continue running the sve tests on non-sve hardware to test NotSupportedException for every API. After .NET 9, when we have high confidence, we will mark the sve apis on non-sve hardware to run just on outerloop CI

kunalspathak · 2024-03-25T17:59:00Z

src/coreclr/jit/emitarm64.cpp

@@ -9836,7 +9836,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
            fmt      = IF_SVE_IE_2A;

            // TODO-SVE: Don't assume 128bit vectors
-            scale        = NaturalScale_helper(EA_16BYTE);
+            scale        = 4;


wondering why is this change? It will return 4 here, right?

wondering why is this change? It will return 4 here, right?

Agreed. The changes for the mask load/stores were the required ones.

Reverted.

kunalspathak · 2024-03-27T17:47:57Z

ping @tannergooding

tannergooding · 2024-03-28T15:03:21Z

src/coreclr/jit/emitarm64.cpp

@@ -7875,7 +7875,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va

            // TODO-SVE: Don't assume 128bit vectors
            // Predicate size is vector length / 8
-            scale        = NaturalScale_helper(EA_2BYTE);
+            scale        = 2;


Given the todo above, are we potentially missing an assert which would ensure this is updated for future instructions that need it?

Given the todo above, are we potentially missing an assert which would ensure this is updated for future instructions that need it?

This is within code just for predicates. So will always be valid as long as the vector length is 128bits.

There is a question of what testing to do for >128-bit vectors for .NET 9. I suspect a lot of assumptions are made elsewhere that the vector length is 128 bits, and these will require some major debugging. At some point I can do some testing on machines with larger vector lengths. Due to time constraints, maybe the solution for .NET 9 is to have a check on startup: if the vector length is >128 bits then ask the kernel to reduce it to 128 bits? Or just disable SVE on >128-bit machines.

That question probably needs a much broader discussion.

There's notably not a lot of benefit of 128-bit SVE over AdvSimd. There are a few new instructions and the ability to emit denser assembly in a few cases, but most of that isn't for the typical hot loop of a method, and in some cases the code can be less dense (emitting SVE Abs requires predication and a ptrue to be generated, while AdvSimd Abs does not; so given 128-bit vectors and no predication, AdvSimd is better to use for that instruction).

At the same time, there is hardware (AWS Graviton) that has 256-bit SVE support that will most likely run on .NET 9. So it would probably be best if we could ensure it is appropriately handled and we're best able to take advantage of such hardware, not artificially limit it.

tannergooding · 2024-03-28T15:04:16Z

src/coreclr/jit/hwintrinsic.cpp

+                break;
+
+            case 3:
+                op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd);


Is there no case where a range check is needed for 3 arguments?

Should there be an assert to validate that?

Is there no case where a range check is needed for 3 arguments?

That seems to be the case.

The case for 3 args is checked further down and depending on the intrinsics, does the addRangeCheckIfNeeded on appropriate arg. Not sure if we should still add assert.

src/coreclr/jit/hwintrinsic.cpp

tannergooding · 2024-03-28T15:09:09Z

src/coreclr/jit/hwintrinsic.cpp

+                default:
+                    break;
+            }
+            op1 = gtNewSimdEmbeddedMaskNode(simdBaseJitType, simdSize);


Why does this need to be done here?

This seems like its just inserting the implicit AllTrue mask that some instructions require, which is effectively allocating and forcing an extra node to be carried through all of HIR when the high level operation doesn't actually care about it.

Seemingly this could just be inserted as part of lowering instead so that it only impacts LSRA and codegen?

src/coreclr/jit/hwintrinsicarm64.cpp

tannergooding · 2024-03-28T15:23:30Z

src/coreclr/jit/hwintrinsiccodegenarm64.cpp

+                        break;
+
+                    case 3:
+                        GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt);


Do we need an assert that none of the input registers are mask registers?

I'd guess the only RMW instructions with masks are ones that have at least 4 operands, so we shouldn't ever see that here.

tannergooding · 2024-03-28T15:29:46Z

src/coreclr/jit/hwintrinsiclistarm64sve.h

@@ -17,6 +17,10 @@
 //  SVE Intrinsics

 // Sve
+HARDWARE_INTRINSIC(Sve,           Abs,                                                              -1,     -1,      false, {INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_fabs,       INS_sve_fabs},    HW_Category_SIMD,                  HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)


Do we need the Scalable flag? Can that not just be detected by the InstructionSet being Sve?

tannergooding · 2024-03-28T15:30:57Z

src/coreclr/jit/hwintrinsiclistarm64sve.h

@@ -17,6 +17,10 @@
 //  SVE Intrinsics

 // Sve
+HARDWARE_INTRINSIC(Sve,           Abs,                                                              -1,     -1,      false, {INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_fabs,       INS_sve_fabs},    HW_Category_SIMD,                  HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
+
+HARDWARE_INTRINSIC(Sve,           Add,                                                              -1,     -1,      false, {INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_fadd,       INS_sve_fadd},    HW_Category_SIMD,                  HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)


Can you elaborate a bit on what LowMaskedOperation means?

I'm looking at the architecture manual and don't see any limitations on the Add (predicated) instruction. The comment attached to the enum entry is `// The intrinsic uses a mask in arg1 to select elements present in the result, and must use a low register.`

Can you elaborate a bit on what LowMaskedOperation means?

The instruction only has 3 bits for a predicate register, so it is limited to using predicate registers 0 to 7. This is quite a common pattern across SVE, hence using a common enum for it (I wouldn't do similar for the handful that have only 2 bits).

Ah, I see. I think a flag makes sense then, but I'm not a huge fan of the name given how low is used in various other contexts.

Maybe something more explicit like RestrictedPredicateRegisterSet (or an alternative name) would work and make it clearer what the mask means/implies?

tannergooding · 2024-03-28T15:31:47Z

src/coreclr/jit/hwintrinsiclistarm64sve.h

@@ -17,6 +17,10 @@
 //  SVE Intrinsics

 // Sve
+HARDWARE_INTRINSIC(Sve,           Abs,                                                              -1,     -1,      false, {INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_abs,        INS_invalid,        INS_sve_fabs,       INS_sve_fabs},    HW_Category_SIMD,                  HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
+
+HARDWARE_INTRINSIC(Sve,           Add,                                                              -1,     -1,      false, {INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_add,        INS_sve_fadd,       INS_sve_fadd},    HW_Category_SIMD,                  HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)


Why does this one in particular need EmbeddedMaskedOperation, the manual has entries for both Add (vectors, predicated) and Add (vectors, unpredicated). Both entries look to be for SVE1

tannergooding · 2024-03-28T15:38:29Z

...braries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm/Sve.PlatformNotSupported.cs

+        ///  Abs : Absolute value
+
+        /// <summary>
+        /// svint8_t svabs[_s8]_m(svint8_t inactive, svbool_t pg, svint8_t op)


nit: We've historically also included the primary underlying instruction that gets emitted for a given API.

For example, with AdvSimd:

/// <summary>
/// float64x2_t vabsq_f64 (float64x2_t a)
/// A64: FABS Vd.2D, Vn.2D
/// </summary>

In this case, I'd expect something like A64: ABS Zd.B, Pg/M, Zn.B (assuming I got that mostly right)

tannergooding · 2024-03-28T15:39:27Z

src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs

+        public static System.Numerics.Vector<long> Abs(System.Numerics.Vector<long> value) { throw null; }
+        public static System.Numerics.Vector<float> Abs(System.Numerics.Vector<float> value) { throw null; }
+        public static System.Numerics.Vector<double> Abs(System.Numerics.Vector<double> value) { throw null; }
+


There are no newlines when this is generated using the tooling (the tooling doesn't always work, unfortunately), so we shouldn't add any here ourselves either.

tannergooding · 2024-03-28T15:40:46Z

src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs

+        public static System.Numerics.Vector<sbyte> Abs(System.Numerics.Vector<sbyte> value) { throw null; }
+        public static System.Numerics.Vector<short> Abs(System.Numerics.Vector<short> value) { throw null; }
+        public static System.Numerics.Vector<int> Abs(System.Numerics.Vector<int> value) { throw null; }
+        public static System.Numerics.Vector<long> Abs(System.Numerics.Vector<long> value) { throw null; }
+        public static System.Numerics.Vector<float> Abs(System.Numerics.Vector<float> value) { throw null; }
+        public static System.Numerics.Vector<double> Abs(System.Numerics.Vector<double> value) { throw null; }


The tooling also expects these to be in alphabetical order, based on the fully qualified type name.

So byte, double, short, int, long, nint, sbyte, float, ushort, uint, ulong, nuint (corresponding to Byte, Double, Int16, Int32, Int64, IntPtr, SByte, Single, UInt16, UInt32, UInt64, UIntPtr)

tannergooding · 2024-03-28T15:42:24Z

src/tests/JIT/HardwareIntrinsics/Arm/Shared/_SveBinaryOpTestTemplate.template

@@ -0,0 +1,328 @@
+// Licensed to the .NET Foundation under one or more agreements.


What's "different" about this compared to the _BinaryOpTestTemplate.template?

I would have expected them to be essentially identical and any minor differences could likely have been adjusted, rather than requiring an entirely new template.

tannergooding · 2024-03-28T15:45:55Z

src/tests/Common/GenerateHWIntrinsicTests/GenerateHWIntrinsicTests_Arm.cs

@@ -2887,6 +2889,24 @@

 (string templateFileName, Dictionary<string, string> templateData)[] SveInputs = new []
 {
+    ("SveSimpleVecOpTest.template",       new Dictionary<string, string> { ["TestName"] = "Sve_Abs_float",                                                                                         ["Isa"] = "Sve",           ["LoadIsa"] = "Sve",     ["Method"] = "Abs",                                                                  ["RetVectorType"] = "Vector",    ["RetBaseType"] = "Single",  ["Op1VectorType"] = "Vector",    ["Op1BaseType"] = "Single",                                                                                                                           ["LargestVectorSize"] = "8",  ["NextValueOp1"] = "-TestLibrary.Generator.GetSingle()",                                                                                                                                                     ["ValidateIterResult"] = "Helpers.Abs(firstOp[i]) != result[i]"}),


Why is LargestVectorSize == 8? Presumably it should be more like 256, which is the largest supported SVE vector length.

Or at worst it should be 16 (the minimum SVE vector length), 32 (the largest size in server grade hardware), or 64 (the largest size in any SVE implementation to date, notably only in a supercomputer).

Yes, I think the same as I commented in #100366 (comment).

kunalspathak · 2024-04-24T21:44:06Z

This functionality got merged as part of #100743

dotnet-issue-labeler bot added the area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) and new-api-needs-documentation labels Mar 22, 2024

dotnet-policy-service bot added the community-contribution (indicates that the PR has been added by a community member) label Mar 22, 2024

JIT ARM64-SVE: Add Sve.Abs() and Sve.Add()

0d437e3

Change-Id: Ie8cfe828595da9a87adbc0857c0c44c0ce12f5b2

a74nh force-pushed the api_abs_github branch from 5344235 to 0d437e3 Compare March 22, 2024 10:46

This was referenced Mar 22, 2024

Build linux-x64 Debug Mono_MiniJIT_LibrariesTests failure #99942

Closed

Assert failure(PID 13812 [0x000035f4], Thread: 14128 [0x3730]): promoted_bytes (heap_number) == promoted #100035

Closed

Fix sve scaling in enitIns_R_S/S_R

e9fa735

a74nh marked this pull request as ready for review March 22, 2024 13:32

build-analysis bot mentioned this pull request Mar 22, 2024

Crash in Microsoft.Extensions.Logging.Generators.Roslyn4.0.Tests.WorkItemExecution #90019

Open

kunalspathak added the arm-sve (work related to arm64 SVE/SVE2 support) label Mar 22, 2024

kunalspathak reviewed Mar 22, 2024

View reviewed changes

kunalspathak mentioned this pull request Mar 22, 2024

Arm64: Implement SVE APIs #99957

Closed

a74nh added 3 commits March 23, 2024 13:21

Revert "Fix sve scaling in enitIns_R_S/S_R"

5acc122

This reverts commit e9fa735.

Fix sve scaling in enitIns_R_S/S_R

15e893d

Restore testing

c22f458

kunalspathak reviewed Mar 25, 2024

View reviewed changes

kunalspathak requested a review from tannergooding March 25, 2024 17:59

Use NaturalScale_helper for vector load/stores

508a52a

Merge remote-tracking branch 'origin/main' into api_abs_github

5825c20

tannergooding reviewed Mar 28, 2024

View reviewed changes

src/coreclr/jit/hwintrinsic.cpp Show resolved Hide resolved

tannergooding reviewed Mar 28, 2024

View reviewed changes

src/coreclr/jit/hwintrinsicarm64.cpp Show resolved Hide resolved

tannergooding reviewed Mar 28, 2024

View reviewed changes

This was referenced Apr 7, 2024

Arm64/Sve: Predicated Abs, Predicated/UnPredicated Add, Conditional Select #100743

Merged

Arm64/Sve: The scaling of immediate in ldr and other instructions should take into account variable VL #100991

Open

kunalspathak closed this Apr 24, 2024

github-actions bot locked and limited conversation to collaborators May 25, 2024

		@@ -0,0 +1,328 @@
		// Licensed to the .NET Foundation under one or more agreements.

JIT ARM64-SVE: Add Sve.Abs() and Sve.Add() #100134

JIT ARM64-SVE: Add Sve.Abs() and Sve.Add() #100134

Conversation

a74nh commented Mar 22, 2024

dotnet-issue-labeler bot commented Mar 22, 2024

dotnet-policy-service bot commented Mar 22, 2024

a74nh commented Mar 22, 2024

a74nh commented Mar 22, 2024

a74nh commented Mar 22, 2024

a74nh commented Mar 22, 2024

a74nh commented Mar 22, 2024

kunalspathak left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

a74nh commented Mar 25, 2024

kunalspathak commented Mar 25, 2024

kunalspathak commented Mar 25, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kunalspathak commented Mar 27, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

tannergooding Mar 28, 2024 • edited Loading

Choose a reason for hiding this comment

tannergooding Mar 28, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kunalspathak commented Apr 24, 2024

tannergooding Mar 28, 2024 •

edited

Loading

tannergooding Mar 28, 2024 •

edited

Loading