Skip to content

Commit

Permalink
Merge pull request #3781 from Sonicadvance1/optimize_vmovlh
Browse files (browse the repository at this point in the history)
AVX128: Minor optimization to vmov{l,h}{ps,pd}
  • Loading branch information
Sonicadvance1 authored Jun 30, 2024
2 parents 76f3391 + cc0509c commit cf24d3c
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 27 deletions.
14 changes: 7 additions & 7 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -837,8 +837,8 @@ void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) {
///< VMOVLPS/PD xmm1, xmm2, mem64
// Bits[63:0] come from Src2[63:0]
// Bits[127:64] come from Src1[127:64]
auto Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags);
Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2, Src1.Low);
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});
Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 0, Src2);
Ref ZeroVector = LoadZeroVector(OpSize::i128Bit);

AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector});
Expand All @@ -857,18 +857,18 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) {
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);

if (Op->Dest.IsGPR()) {
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false);
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});

// Bits[63:0] come from Src1[63:0]
// Bits[127:64] come from Src2[63:0]
Ref Result_Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, Src2.Low);
Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Src2);
Ref ZeroVector = LoadZeroVector(OpSize::i128Bit);

AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector});
} else {
// Need to store Bits[127:64]. Duplicate the element to get it in the low bits.
Src1.Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1);
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src1.Low, OpSize::i64Bit, OpSize::i64Bit);
// Need to store Bits[127:64]. Use a vector element store.
auto Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, OpSize::i64Bit, Op->Flags, {.LoadData = false});
_VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Dest);
}
}

Expand Down
34 changes: 14 additions & 20 deletions unittests/InstructionCountCI/AVX128/VEX_map1.json
Original file line number Diff line number Diff line change
Expand Up @@ -234,29 +234,27 @@
]
},
"vmovlps xmm0, xmm1, [rax]": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 4,
"Comment": [
"Insert in to first element could be more optimal, which is the common case.",
"Map 1 0b00 0x12 128-bit"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[1]",
"mov v16.16b, v17.16b",
"ld1 {v16.d}[0], [x4]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vmovlpd xmm0, xmm1, [rax]": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 4,
"Comment": [
"Insert in to first element could be more optimal, which is the common case.",
"Map 1 0b01 0x12 128-bit"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[1]",
"mov v16.16b, v17.16b",
"ld1 {v16.d}[0], [x4]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
Expand Down Expand Up @@ -439,8 +437,8 @@
"Map 1 0b00 0x16 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"zip1 v16.2d, v17.2d, v2.2d",
"mov v16.16b, v17.16b",
"ld1 {v16.d}[1], [x4]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
Expand All @@ -451,8 +449,8 @@
"Map 1 0b01 0x16 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"zip1 v16.2d, v17.2d, v2.2d",
"mov v16.16b, v17.16b",
"ld1 {v16.d}[1], [x4]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
Expand Down Expand Up @@ -483,25 +481,21 @@
]
},
"vmovhps [rax], xmm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": [
"Can be more optimal with an element store.",
"Map 1 0b00 0x17 128-bit"
],
"ExpectedArm64ASM": [
"dup v2.2d, v16.d[1]",
"str d2, [x4]"
"st1 {v16.d}[1], [x4]"
]
},
"vmovhpd [rax], xmm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Comment": [
"Can be more optimal with an element store.",
"Map 1 0b01 0x17 128-bit"
],
"ExpectedArm64ASM": [
"dup v2.2d, v16.d[1]",
"str d2, [x4]"
"st1 {v16.d}[1], [x4]"
]
},
"vmovmskps rax, xmm0": {
Expand Down

0 comments on commit cf24d3c

Please sign in to comment.