Merge pull request #3781 from Sonicadvance1/optimize_vmovlh

AVX128: Minor optimization to vmov{l,h}{ps,pd}
FEX-Emu · Jun 30, 2024 · commit cf24d3c
2 parents: 76f3391 + cc0509c
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 27 deletions.
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@@ -837,8 +837,8 @@ void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) {
     ///< VMOVLPS/PD xmm1, xmm2, mem64
     // Bits[63:0] come from Src2[63:0]
     // Bits[127:64] come from Src1[127:64]
-    auto Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags);
-    Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2, Src1.Low);
+    auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});
+    Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 0, Src2);
     Ref ZeroVector = LoadZeroVector(OpSize::i128Bit);
 
     AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector});
@@ -857,18 +857,18 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) {
   auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
 
   if (Op->Dest.IsGPR()) {
-    auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false);
+    auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});
 
     // Bits[63:0] come from Src1[63:0]
     // Bits[127:64] come from Src2[63:0]
-    Ref Result_Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, Src2.Low);
+    Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Src2);
     Ref ZeroVector = LoadZeroVector(OpSize::i128Bit);
 
     AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector});
   } else {
-    // Need to store Bits[127:64]. Duplicate the element to get it in the low bits.
-    Src1.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1);
-    StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src1.Low, OpSize::i64Bit, OpSize::i64Bit);
+    // Need to store Bits[127:64]. Use a vector element store.
+    auto Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, OpSize::i64Bit, Op->Flags, {.LoadData = false});
+    _VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Dest);
   }
 }
 

diff --git a/unittests/InstructionCountCI/AVX128/VEX_map1.json b/unittests/InstructionCountCI/AVX128/VEX_map1.json
@@ -234,29 +234,27 @@
       ]
     },
     "vmovlps xmm0, xmm1, [rax]": {
-      "ExpectedInstructionCount": 5,
+      "ExpectedInstructionCount": 4,
       "Comment": [
         "Insert in to first element could be more optimal, which is the common case.",
         "Map 1 0b00 0x12 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "ldr d2, [x4]",
-        "mov v16.16b, v2.16b",
-        "mov v16.d[1], v17.d[1]",
+        "mov v16.16b, v17.16b",
+        "ld1 {v16.d}[0], [x4]",
         "movi v2.2d, #0x0",
         "str q2, [x28, #16]"
       ]
     },
     "vmovlpd xmm0, xmm1, [rax]": {
-      "ExpectedInstructionCount": 5,
+      "ExpectedInstructionCount": 4,
       "Comment": [
         "Insert in to first element could be more optimal, which is the common case.",
         "Map 1 0b01 0x12 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "ldr d2, [x4]",
-        "mov v16.16b, v2.16b",
-        "mov v16.d[1], v17.d[1]",
+        "mov v16.16b, v17.16b",
+        "ld1 {v16.d}[0], [x4]",
         "movi v2.2d, #0x0",
         "str q2, [x28, #16]"
       ]
@@ -439,8 +437,8 @@
         "Map 1 0b00 0x16 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "ldr q2, [x4]",
-        "zip1 v16.2d, v17.2d, v2.2d",
+        "mov v16.16b, v17.16b",
+        "ld1 {v16.d}[1], [x4]",
         "movi v2.2d, #0x0",
         "str q2, [x28, #16]"
       ]
@@ -451,8 +449,8 @@
         "Map 1 0b01 0x16 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "ldr q2, [x4]",
-        "zip1 v16.2d, v17.2d, v2.2d",
+        "mov v16.16b, v17.16b",
+        "ld1 {v16.d}[1], [x4]",
         "movi v2.2d, #0x0",
         "str q2, [x28, #16]"
       ]
@@ -483,25 +481,21 @@
       ]
     },
     "vmovhps [rax], xmm0": {
-      "ExpectedInstructionCount": 2,
+      "ExpectedInstructionCount": 1,
       "Comment": [
-        "Can be more optimal with an element store.",
         "Map 1 0b00 0x17 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "dup v2.2d, v16.d[1]",
-        "str d2, [x4]"
+        "st1 {v16.d}[1], [x4]"
       ]
     },
     "vmovhpd [rax], xmm0": {
-      "ExpectedInstructionCount": 2,
+      "ExpectedInstructionCount": 1,
       "Comment": [
-        "Can be more optimal with an element store.",
         "Map 1 0b01 0x17 128-bit"
       ],
       "ExpectedArm64ASM": [
-        "dup v2.2d, v16.d[1]",
-        "str d2, [x4]"
+        "st1 {v16.d}[1], [x4]"
       ]
     },
     "vmovmskps rax, xmm0": {