From 33a3763b29fd3f9771a39adf8925fb25a408aac3 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Fri, 22 Nov 2024 11:17:21 +0100 Subject: [PATCH] instcountci: testing multiple 80bit ldst using SVE In preparation for #4166 which should improve on these results. --- unittests/InstructionCountCI/Primary.json | 22 +- unittests/InstructionCountCI/X87ldst-SVE.json | 343 ++++++++++++++++++ 2 files changed, 354 insertions(+), 11 deletions(-) create mode 100644 unittests/InstructionCountCI/X87ldst-SVE.json diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 9d81d4ac58..91807d2fd7 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -1996,10 +1996,10 @@ "ExpectedInstructionCount": 4, "Comment": "0x86", "ExpectedArm64ASM": [ - "mov x20, x7", - "bfxil x20, x6, #0, #8", - "bfxil x6, x7, #0, #8", - "mov x7, x20" + "mov x20, x6", + "bfxil x20, x7, #0, #8", + "bfxil x7, x6, #0, #8", + "mov x6, x20" ] }, "xchg [rax], cl": { @@ -2014,10 +2014,10 @@ "ExpectedInstructionCount": 4, "Comment": "0x87", "ExpectedArm64ASM": [ - "mov x20, x7", - "bfxil x20, x6, #0, #16", - "bfxil x6, x7, #0, #16", - "mov x7, x20" + "mov x20, x6", + "bfxil x20, x7, #0, #16", + "bfxil x7, x6, #0, #16", + "mov x6, x20" ] }, "xchg [rax], cx": { @@ -2032,9 +2032,9 @@ "ExpectedInstructionCount": 3, "Comment": "0x87", "ExpectedArm64ASM": [ - "mov w20, w6", - "mov w6, w7", - "mov x7, x20" + "mov w20, w7", + "mov w7, w6", + "mov x6, x20" ] }, "xchg [rax], ecx": { diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json new file mode 100644 index 0000000000..4b63bd77e8 --- /dev/null +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -0,0 +1,343 @@ +{ + "Features": { + "Bitness": 64, + "EnabledHostFeatures": [ + "SVE128", + "SVE256" + ], + "DisabledHostFeatures": [ + "AFP", + "FLAGM", + "FLAGM2", + "RPRES" + ] + }, + "Instructions": { + "fstp tword [rax]": { + 
"ExpectedInstructionCount": 15, + "Comment": "Single 80-bit store.", + "ExpectedArm64ASM": [ + "ldrb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x4]", + "mov x21, v2.d[1]", + "add x22, x4, #0x8 (8)", + "strh w21, [x22]", + "ldrb w21, [x28, #1298]", + "mov w22, #0x1", + "lsl w22, w22, w20", + "bic w21, w21, w22", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]" + ] + }, + "2-store 80bit": { + "x86InstructionCount": 2, + "ExpectedInstructionCount": 29, + "x86Insts": [ + "fstp tword [rax]", + "fstp tword [rax+10]" + ], + "ExpectedArm64ASM": [ + "ldrb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x4]", + "mov x21, v2.d[1]", + "add x22, x4, #0x8 (8)", + "strh w21, [x22]", + "ldrb w21, [x28, #1298]", + "mov w22, #0x1", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0xa (10)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w22, w22, w20", + "bic w21, w21, w22", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]" + ] + }, + "8-store 80bit": { + "x86InstructionCount": 8, + "ExpectedInstructionCount": 113, + "x86Insts": [ + "fstp tword [rax]", + "fstp tword [rax+10]", + "fstp tword [rax+20]", + "fstp tword [rax+30]", + "fstp tword [rax+40]", + "fstp tword [rax+50]", + "fstp tword [rax+60]", + "fstp tword [rax+70]" + ], + "ExpectedArm64ASM": [ + "ldrb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x4]", + "mov x21, v2.d[1]", + "add x22, x4, #0x8 (8)", + "strh w21, [x22]", + "ldrb w21, [x28, #1298]", + "mov w22, #0x1", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", 
+ "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0xa (10)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x14 (20)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x1e (30)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x28 (40)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x32 (50)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x3c (60)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, 
x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w23, w22, w20", + "bic w21, w21, w23", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x21, x4, #0x46 (70)", + "add x0, x28, x20, lsl #4", + "ldr q2, [x0, #1040]", + "str d2, [x21]", + "mov x23, v2.d[1]", + "add x21, x21, #0x8 (8)", + "strh w23, [x21]", + "ldrb w21, [x28, #1298]", + "lsl w22, w22, w20", + "bic w21, w21, w22", + "strb w21, [x28, #1298]", + "add w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]" + ] + }, + "fld tword [rax]": { + "ExpectedInstructionCount": 14, + "Comment": "Single 80-bit load.", + "ExpectedArm64ASM": [ + "ldr d2, [x4]", + "add x20, x4, #0x8 (8)", + "ld1 {v2.h}[4], [x20]", + "ldrb w20, [x28, #1019]", + "mov w21, #0x1", + "sub w20, w20, #0x1 (1)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "str q2, [x0, #1040]", + "ldrb w22, [x28, #1298]", + "lsl w20, w21, w20", + "orr w20, w22, w20", + "strb w20, [x28, #1298]" + ] + }, + "2-load 80bit": { + "x86InstructionCount": 2, + "ExpectedInstructionCount": 24, + "x86Insts": [ + "fld tword [rax]", + "fld tword [rax+10]" + ], + "ExpectedArm64ASM": [ + "ldr d2, [x4]", + "add x20, x4, #0x8 (8)", + "ld1 {v2.h}[4], [x20]", + "add x20, x4, #0xa (10)", + "ldr d3, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v3.h}[4], [x20]", + "ldrb w20, [x28, #1019]", + "sub w20, w20, #0x2 (2)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "str q3, [x0, #1040]", + "add w21, w20, #0x1 (1)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q2, [x0, #1040]", + "mov w21, #0x8", + "sub w20, w21, w20", + "ldrb w21, [x28, #1298]", + "mov w22, #0x303", + "lsr w20, w22, w20", + "orr w20, w21, w20", + "strb w20, [x28, #1298]" + ] + }, + "8-load 80bit": { + "x86InstructionCount": 8, + "ExpectedInstructionCount": 67, + "x86Insts": [ + "fld tword [rax]", + "fld tword [rax+10]", + "fld
tword [rax+20]", + "fld tword [rax+30]", + "fld tword [rax+40]", + "fld tword [rax+50]", + "fld tword [rax+60]", + "fld tword [rax+70]" + ], + "ExpectedArm64ASM": [ + "ldr d2, [x4]", + "add x20, x4, #0x8 (8)", + "ld1 {v2.h}[4], [x20]", + "add x20, x4, #0xa (10)", + "ldr d3, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v3.h}[4], [x20]", + "add x20, x4, #0x14 (20)", + "ldr d4, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v4.h}[4], [x20]", + "add x20, x4, #0x1e (30)", + "ldr d5, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v5.h}[4], [x20]", + "add x20, x4, #0x28 (40)", + "ldr d6, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v6.h}[4], [x20]", + "add x20, x4, #0x32 (50)", + "ldr d7, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v7.h}[4], [x20]", + "add x20, x4, #0x3c (60)", + "ldr d8, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v8.h}[4], [x20]", + "add x20, x4, #0x46 (70)", + "ldr d9, [x20]", + "add x20, x20, #0x8 (8)", + "ld1 {v9.h}[4], [x20]", + "ldrb w20, [x28, #1019]", + "sub w20, w20, #0x8 (8)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", + "str q9, [x0, #1040]", + "add w21, w20, #0x1 (1)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q8, [x0, #1040]", + "add w21, w20, #0x2 (2)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q7, [x0, #1040]", + "add w21, w20, #0x3 (3)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q6, [x0, #1040]", + "add w21, w20, #0x4 (4)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q5, [x0, #1040]", + "add w21, w20, #0x5 (5)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q4, [x0, #1040]", + "add w21, w20, #0x6 (6)", + "and w21, w21, #0x7", + "add x0, x28, x21, lsl #4", + "str q3, [x0, #1040]", + "add w20, w20, #0x7 (7)", + "and w20, w20, #0x7", + "add x0, x28, x20, lsl #4", + "str q2, [x0, #1040]", + "mov w20, #0xff", + "strb w20, [x28, #1298]" + ] + } + } +}