From 11a494d7b3372d855114667c2959e7fa17b9eb4b Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 5 Jul 2024 16:22:49 -0700 Subject: [PATCH 1/2] AVX128: Prescale addresses in gathers if possible If the host supports SVE128, the address element size and data size are both 64-bit, and the scale is not one of the two that are supported by SVE, then prescale the addresses. 64-bit address overflow masks the top bits, so it is well defined that we can scale the vector elements and still execute the SVE code path in that case. This removes the ASIMD code paths from a lot of gathers. Fixes #3805 --- .../Interface/Core/OpcodeDispatcher/AVX_128.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index f7dc51897d..16477b47df 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -2603,6 +2603,19 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, O BaseAddr = Invalid(); } + if (ElementLoadSize == OpSize::i64Bit && AddrElementSize == OpSize::i64Bit && (VSIB.Scale == 2 || VSIB.Scale == 4) && + CTX->HostFeatures.SupportsSVE128) { + // SVE gather instructions don't support scaling their vector elements by anything other than 1 or the address element size. + // Pre-scale 64-bit addresses in the case that scale doesn't match in-order to hit SVE code paths more frequently. + // Only hit this path if the host supports SVE. Otherwise it's a degradation for the ASIMD codepath. + VSIB.Low = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale)); + if (!Is128Bit) { + VSIB.High = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.High, FEXCore::ilog2(VSIB.Scale)); + } + ///< Set the scale to one now that it has been prescaled. + VSIB.Scale = 1; + } + RefPair Result {}; ///< Calculate the low-half. 
Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High, From 6e8ca3bc6c0d28f07849a1fc79d6b57bf39088bd Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 5 Jul 2024 16:25:59 -0700 Subject: [PATCH 2/2] InstcountCI: Update for gather prescaling --- .../AVX128/VEX_map2_SVE128.json | 368 ++++++------------ unittests/InstructionCountCI/VEX_map2.json | 192 +++------ 2 files changed, 168 insertions(+), 392 deletions(-) diff --git a/unittests/InstructionCountCI/AVX128/VEX_map2_SVE128.json b/unittests/InstructionCountCI/AVX128/VEX_map2_SVE128.json index fcafd8c2e7..320175b8e5 100644 --- a/unittests/InstructionCountCI/AVX128/VEX_map2_SVE128.json +++ b/unittests/InstructionCountCI/AVX128/VEX_map2_SVE128.json @@ -867,22 +867,16 @@ ] }, "vpgatherqq xmm0, [xmm1*2 + rax], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -890,22 +884,16 @@ ] }, "vpgatherqq xmm0, [xmm1*4 + rax], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "mov 
z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -951,7 +939,7 @@ ] }, "vpgatherqq ymm0, [ymm1*2 + rax], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -959,27 +947,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #1", + "shl v3.2d, v3.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [x4, z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -987,7 +963,7 @@ ] }, "vpgatherqq ymm0, [ymm1*4 + rax], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -995,27 +971,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #2", + "shl v3.2d, v3.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, 
p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [x4, z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -1795,22 +1759,16 @@ ] }, "vgatherqpd xmm0, [xmm1*2 + rax], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -1818,22 +1776,16 @@ ] }, "vgatherqpd xmm0, [xmm1*4 + rax], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -1879,7 +1831,7 @@ ] }, "vgatherqpd ymm0, [ymm1*2 + rax], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -1887,27 +1839,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #1", + "shl v3.2d, v3.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 
{v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [x4, z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -1915,7 +1855,7 @@ ] }, "vgatherqpd ymm0, [ymm1*4 + rax], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -1923,27 +1863,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #2", + "shl v3.2d, v3.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [x4, z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -2710,22 +2638,16 @@ ] }, "vpgatherqq xmm0, [xmm1*2], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, 
#+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #1", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -2733,22 +2655,16 @@ ] }, "vpgatherqq xmm0, [xmm1*4], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #2", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -2795,7 +2711,7 @@ ] }, "vpgatherqq ymm0, [ymm1*2], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -2803,27 +2719,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #1", + "shl v3.2d, v3.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #1", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "lsl x1, x0, #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "lsl x1, x0, #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, 
[x28, #48]", @@ -2831,7 +2735,7 @@ ] }, "vpgatherqq ymm0, [ymm1*4], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -2839,27 +2743,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #2", + "shl v3.2d, v3.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #2", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "lsl x1, x0, #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "lsl x1, x0, #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -3644,22 +3536,16 @@ ] }, "vgatherqpd xmm0, [xmm1*2], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #1", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -3667,22 +3553,16 @@ ] }, "vgatherqpd xmm0, [xmm1*4], xmm2": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, 
x0, #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #2", - "ld1 {v16.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z2.d]", + "mov z16.d, p0/m, z0.d", "movi v18.2d, #0x0", "str q18, [x28, #16]", "str q18, [x28, #48]", @@ -3729,7 +3609,7 @@ ] }, "vgatherqpd ymm0, [ymm1*2], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -3737,27 +3617,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #1", + "shl v3.2d, v3.2d, #1", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #1", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #1", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "lsl x1, x0, #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "lsl x1, x0, #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", @@ -3765,7 +3633,7 @@ ] }, "vgatherqpd ymm0, [ymm1*4], ymm2": { - "ExpectedInstructionCount": 28, + "ExpectedInstructionCount": 16, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -3773,27 +3641,15 @@ "ldr q2, [x28, #16]", "ldr q3, [x28, #32]", "ldr q4, [x28, #48]", + "shl v5.2d, v17.2d, #2", + "shl v3.2d, v3.2d, #2", "mrs x20, nzcv", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "lsl x1, x0, #2", - "ld1 {v16.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "lsl x1, x0, #2", - "ld1 {v16.d}[1], [x1]", - "mov x0, v4.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[0]", - "lsl x1, x0, #2", - "ld1 
{v2.d}[0], [x1]", - "mov x0, v4.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v3.d[1]", - "lsl x1, x0, #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [z5.d]", + "mov z16.d, p0/m, z0.d", + "cmplt p0.d, p6/z, z4.d, #0", + "ld1d {z0.d}, p0/z, [z3.d]", + "mov z2.d, p0/m, z0.d", "str q2, [x28, #16]", "movi v18.2d, #0x0", "str q18, [x28, #48]", diff --git a/unittests/InstructionCountCI/VEX_map2.json b/unittests/InstructionCountCI/VEX_map2.json index 0192c9b481..3e49b8984c 100644 --- a/unittests/InstructionCountCI/VEX_map2.json +++ b/unittests/InstructionCountCI/VEX_map2.json @@ -2810,23 +2810,16 @@ ] }, "vpgatherqq xmm0, [xmm1*2 + rax], xmm2": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 11, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov v2.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "sel z2.d, p0, z0.d, z16.d", "movi v18.2d, #0x0", "mov z1.q, q18", "not p0.b, p7/z, p6.b", @@ -2836,23 +2829,16 @@ ] }, "vpgatherqq xmm0, [xmm1*4 + rax], xmm2": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 11, "Comment": [ "Map 2 0b01 0x91 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov v2.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "sel z2.d, p0, z0.d, z16.d", "movi v18.2d, #0x0", "mov z1.q, q18", "not p0.b, p7/z, p6.b", @@ -2891,7 +2877,7 @@ ] }, 
"vpgatherqq ymm0, [ymm1*2 + rax], ymm2": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 18, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -2899,28 +2885,15 @@ "mov z2.q, z16.q[1]", "mov z3.q, z18.q[1]", "mov z4.q, z17.q[1]", + "shl v5.2d, v17.2d, #1", + "shl v4.2d, v4.2d, #1", "mrs x20, nzcv", - "mov v5.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v5.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v5.d}[1], [x1]", - "mov x0, v3.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v3.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "sel z5.d, p0, z0.d, z16.d", + "cmplt p0.d, p6/z, z3.d, #0", + "ld1d {z0.d}, p0/z, [x4, z4.d]", + "mov z2.d, p0/m, z0.d", "mov z1.q, q2", "mov z16.d, z5.d", "not p0.b, p7/z, p6.b", @@ -2930,7 +2903,7 @@ ] }, "vpgatherqq ymm0, [ymm1*4 + rax], ymm2": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 18, "Comment": [ "Map 2 0b01 0x91 256-bit" ], @@ -2938,28 +2911,15 @@ "mov z2.q, z16.q[1]", "mov z3.q, z18.q[1]", "mov z4.q, z17.q[1]", + "shl v5.2d, v17.2d, #2", + "shl v4.2d, v4.2d, #2", "mrs x20, nzcv", - "mov v5.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v5.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v5.d}[1], [x1]", - "mov x0, v3.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v3.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "sel 
z5.d, p0, z0.d, z16.d", + "cmplt p0.d, p6/z, z3.d, #0", + "ld1d {z0.d}, p0/z, [x4, z4.d]", + "mov z2.d, p0/m, z0.d", "mov z1.q, q2", "mov z16.d, z5.d", "not p0.b, p7/z, p6.b", @@ -3806,23 +3766,16 @@ ] }, "vgatherqpd xmm0, [xmm1*2 + rax], xmm2": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 11, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #1", "mrs x20, nzcv", - "mov v2.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "sel z2.d, p0, z0.d, z16.d", "movi v18.2d, #0x0", "mov z1.q, q18", "not p0.b, p7/z, p6.b", @@ -3832,23 +3785,16 @@ ] }, "vgatherqpd xmm0, [xmm1*4 + rax], xmm2": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 11, "Comment": [ "Map 2 0b01 0x93 128-bit" ], "ExpectedArm64ASM": [ + "shl v2.2d, v17.2d, #2", "mrs x20, nzcv", - "mov v2.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z2.d]", + "sel z2.d, p0, z0.d, z16.d", "movi v18.2d, #0x0", "mov z1.q, q18", "not p0.b, p7/z, p6.b", @@ -3887,7 +3833,7 @@ ] }, "vgatherqpd ymm0, [ymm1*2 + rax], ymm2": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 18, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -3895,28 +3841,15 @@ "mov z2.q, z16.q[1]", "mov z3.q, z18.q[1]", "mov z4.q, z17.q[1]", + "shl v5.2d, v17.2d, #1", + "shl v4.2d, v4.2d, #1", "mrs x20, nzcv", - "mov v5.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #1", - 
"ld1 {v5.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v5.d}[1], [x1]", - "mov x0, v3.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[0]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[0], [x1]", - "mov x0, v3.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[1]", - "add x1, x4, x0, lsl #1", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "sel z5.d, p0, z0.d, z16.d", + "cmplt p0.d, p6/z, z3.d, #0", + "ld1d {z0.d}, p0/z, [x4, z4.d]", + "mov z2.d, p0/m, z0.d", "mov z1.q, q2", "mov z16.d, z5.d", "not p0.b, p7/z, p6.b", @@ -3926,7 +3859,7 @@ ] }, "vgatherqpd ymm0, [ymm1*4 + rax], ymm2": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 18, "Comment": [ "Map 2 0b01 0x93 256-bit" ], @@ -3934,28 +3867,15 @@ "mov z2.q, z16.q[1]", "mov z3.q, z18.q[1]", "mov z4.q, z17.q[1]", + "shl v5.2d, v17.2d, #2", + "shl v4.2d, v4.2d, #2", "mrs x20, nzcv", - "mov v5.16b, v16.16b", - "mov x0, v18.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v5.d}[0], [x1]", - "mov x0, v18.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v17.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v5.d}[1], [x1]", - "mov x0, v3.d[0]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[0]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[0], [x1]", - "mov x0, v3.d[1]", - "tbz x0, #63, #+0x10", - "mov x0, v4.d[1]", - "add x1, x4, x0, lsl #2", - "ld1 {v2.d}[1], [x1]", + "cmplt p0.d, p6/z, z18.d, #0", + "ld1d {z0.d}, p0/z, [x4, z5.d]", + "sel z5.d, p0, z0.d, z16.d", + "cmplt p0.d, p6/z, z3.d, #0", + "ld1d {z0.d}, p0/z, [x4, z4.d]", + "mov z2.d, p0/m, z0.d", "mov z1.q, q2", "mov z16.d, z5.d", "not p0.b, p7/z, p6.b",