[X86] X86FixupInstTunings - add VPERMILPDri -> VSHUFPDrri mapping
Similar to the original VPERMILPSri -> VSHUFPSrri mapping added in D143787, replacing VPERMILPDri with VSHUFPDrri should never be any slower and saves an encoding byte.

The sibling VPERMILPDmi -> VPSHUFDmi mapping is trickier, as it needs the same shuffle mask in every lane (and the mask must be adjusted); I haven't attempted that yet, but we can investigate it in the future if there's interest.
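For reference, the equivalence this relies on: with both sources tied to the same register, vshufpd's immediate selects elements exactly as vpermilpd's does, bit I of the immediate picking the low/high double within the 128-bit lane that holds destination element I. A minimal reference model of the 256-bit forms (an illustrative C++ sketch, not LLVM code; all names here are made up):

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<double, 4>;

// vpermilpd ymm, ymm, imm8: bit I selects the low/high element within
// the 128-bit lane containing destination element I.
static V4 vpermilpd256(const V4 &Src, uint8_t Imm) {
  V4 R;
  for (int I = 0; I < 4; ++I) {
    int LaneBase = I & ~1; // 0 for elements 0..1, 2 for elements 2..3
    R[I] = Src[LaneBase + ((Imm >> I) & 1)];
  }
  return R;
}

// vshufpd ymm, a, b, imm8: even destination elements come from a, odd
// ones from b, selected by the same per-element immediate bits.
static V4 vshufpd256(const V4 &A, const V4 &B, uint8_t Imm) {
  V4 R;
  R[0] = A[(Imm >> 0) & 1];
  R[1] = B[(Imm >> 1) & 1];
  R[2] = A[2 + ((Imm >> 2) & 1)];
  R[3] = B[2 + ((Imm >> 3) & 1)];
  return R;
}

int main() {
  V4 X = {1.0, 2.0, 3.0, 4.0};
  for (int Imm = 0; Imm < 16; ++Imm) // all 4-bit immediates
    assert(vpermilpd256(X, Imm) == vshufpd256(X, X, Imm)); // tied sources
  return 0;
}

The same per-element selection is also why the memory form mentioned above is harder: vpshufd's immediate is replicated to every 128-bit lane, so a vpermilpd mask would qualify only when its per-lane bits already agree.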

Fixes llvm#61060

Differential Revision: https://reviews.llvm.org/D148999
RKSimon committed Apr 23, 2023
1 parent b92839c commit e9f9467
Showing 72 changed files with 1,197 additions and 1,159 deletions.
38 changes: 38 additions & 0 deletions llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -126,6 +126,21 @@ bool X86FixupInstTuningPass::processInstruction(
    return ReplaceInTie;
  };

+  // `vpermilpd r, i` -> `vshufpd r, r, i`
+  // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
+  // `vshufpd` is always as fast or faster than `vpermilpd` and takes
+  // 1 less byte of code size for VEX and EVEX encoding.
+  auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
+    if (!NewOpcPreferable(NewOpc))
+      return false;
+    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+    MI.removeOperand(NumOperands - 1);
+    MI.addOperand(MI.getOperand(NumOperands - 2));
+    MI.setDesc(TII->get(NewOpc));
+    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    return true;
+  };
+
  // `vpermilps r, i` -> `vshufps r, r, i`
  // `vpermilps r, i, k` -> `vshufps r, r, i, k`
  // `vshufps` is always as fast or faster than `vpermilps` and takes
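To make the operand surgery in ProcessVPERMILPDri concrete: for an instruction like %ymm1 = VPERMILPDYri %ymm0, 5, the immediate is popped, the remaining source register is duplicated as the second vshufpd source, and the immediate is re-appended, yielding %ymm1 = VSHUFPDYrri %ymm0, %ymm0, 5. A toy model of that sequence (hypothetical stand-in types; the real code operates on MachineInstr/MachineOperand):

#include <cassert>
#include <string>
#include <vector>

struct ToyInstr {
  std::string Opcode;
  std::vector<std::string> Ops; // [dst, src, imm] for the VPERMILPDri forms
};

static void rewriteVPERMILPDri(ToyInstr &MI, const std::string &NewOpc) {
  std::string MaskImm = MI.Ops.back(); // getOperand(NumOperands - 1).getImm()
  MI.Ops.pop_back();                   // MI.removeOperand(NumOperands - 1)
  std::string Src = MI.Ops.back();     // MI.getOperand(NumOperands - 2)
  MI.Ops.push_back(Src);               // duplicate src as the second source
  MI.Opcode = NewOpc;                  // MI.setDesc(TII->get(NewOpc))
  MI.Ops.push_back(MaskImm);           // re-append the shuffle immediate
}

int main() {
  ToyInstr MI{"VPERMILPDYri", {"%ymm1", "%ymm0", "5"}};
  rewriteVPERMILPDri(MI, "VSHUFPDYrri");
  assert(MI.Opcode == "VSHUFPDYrri");
  assert((MI.Ops == std::vector<std::string>{"%ymm1", "%ymm0", "%ymm0", "5"}));
  return 0;
}

Because only the operand tail is touched, the same lambda also serves the masked forms handled below, where extra mask/passthru operands precede the source.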
@@ -210,6 +225,29 @@ bool X86FixupInstTuningPass::processInstruction(
  };

  switch (Opc) {
+  case X86::VPERMILPDri:
+    return ProcessVPERMILPDri(X86::VSHUFPDrri);
+  case X86::VPERMILPDYri:
+    return ProcessVPERMILPDri(X86::VSHUFPDYrri);
+  case X86::VPERMILPDZ128ri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
+  case X86::VPERMILPDZ256ri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
+  case X86::VPERMILPDZri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrri);
+  case X86::VPERMILPDZ128rikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
+  case X86::VPERMILPDZ256rikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
+  case X86::VPERMILPDZrikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
+  case X86::VPERMILPDZ128rik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
+  case X86::VPERMILPDZ256rik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
+  case X86::VPERMILPDZrik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
+
  case X86::VPERMILPSri:
    return ProcessVPERMILPSri(X86::VSHUFPSrri);
  case X86::VPERMILPSYri:
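The rik/rikz opcodes above are the AVX-512 merge- and zero-masked variants. As a rough illustration of where one comes from (assumes an AVX512VL target; whether the permute actually survives to this pass depends on earlier DAG combines), the merge-masked 256-bit case corresponds to source like:

#include <immintrin.h>

// Merge-masked in-lane permute: result elements with a zero mask bit
// keep the corresponding element of `passthru`. Imm 0x5 = 0b0101 swaps
// the two doubles within each 128-bit lane. This typically selects
// VPERMILPDZ256rik, which the pass above can rewrite to VSHUFPDZ256rrik.
__m256d permute_masked(__m256d passthru, __mmask8 k, __m256d x) {
  return _mm256_mask_permute_pd(passthru, k, x, 0x5);
}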
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1257,7 +1257,7 @@ define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_pd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
ret <2 x double> %res
@@ -1266,7 +1266,7 @@ define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_pd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x double> %res
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -843,13 +843,13 @@ declare void @llvm.x86.avx.storeu.ps.256(ptr, <8 x float>) nounwind
define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
; AVX-LABEL: test_x86_avx_vpermil_pd:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; AVX-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01]
; AVX-NEXT: # xmm0 = xmm0[1,0]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermil_pd:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; AVX512VL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01]
; AVX512VL-NEXT: # xmm0 = xmm0[1,0]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
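The encodings in this diff show where the saved byte comes from: the immediate form of vpermilpd lives in the 0F3A opcode map, which can only be expressed with the 3-byte VEX prefix (0xC4), while vshufpd (opcode 0xC6) lives in the 0F map and fits the 2-byte prefix (0xC5). A small sanity check over the byte sequences from the CHECK lines above:

#include <cstdio>

int main() {
  // vpermilpd $1, %xmm0, %xmm0       (3-byte VEX prefix: 0xc4)
  unsigned char Vpermilpd[] = {0xc4, 0xe3, 0x79, 0x05, 0xc0, 0x01};
  // vshufpd $1, %xmm0, %xmm0, %xmm0  (2-byte VEX prefix: 0xc5)
  unsigned char Vshufpd[] = {0xc5, 0xf9, 0xc6, 0xc0, 0x01};
  std::printf("vpermilpd: %zu bytes, vshufpd: %zu bytes\n",
              sizeof(Vpermilpd), sizeof(Vshufpd)); // prints 6 vs 5
  return 0;
}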
@@ -861,13 +861,13 @@ declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnon
define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_vpermil_pd_256:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x07]
+; AVX-NEXT: vshufpd $7, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xc6,0xc0,0x07]
; AVX-NEXT: # ymm0 = ymm0[1,1,3,2]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermil_pd_256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilpd $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x07]
+; AVX512VL-NEXT: vshufpd $7, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc0,0x07]
; AVX512VL-NEXT: # ymm0 = ymm0[1,1,3,2]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -656,13 +656,13 @@ declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) no
define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
+; AVX-NEXT: vshufpd $9, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xc6,0xc0,0x09]
; AVX-NEXT: # ymm0 = ymm0[1,0,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
+; AVX512VL-NEXT: vshufpd $9, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc0,0x09]
; AVX512VL-NEXT: # ymm0 = ymm0[1,0,2,3]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -882,7 +882,7 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: vmulpd (%eax), %xmm0, %xmm1
; X86-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
-; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
@@ -895,7 +895,7 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
-; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; X64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -180,13 +180,13 @@ define <4 x i64> @f64to4sl(<4 x double> %a) {
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; NODQ-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NODQ-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm0
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
@@ -214,7 +214,7 @@ define <4 x i64> @f32to4sl(<4 x float> %a) {
; NODQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; NODQ-NEXT: vcvttss2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; NODQ-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; NODQ-NEXT: vcvttss2si %xmm2, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -59,15 +59,15 @@ define i32 @hsub_16(<16 x i32> %x225) {
define float @fhadd_16(<16 x float> %x225) {
; KNL-LABEL: fhadd_16:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16:
; SKX: # %bb.0:
-; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -84,15 +84,15 @@ define float @fhadd_16(<16 x float> %x225) {
define float @fhsub_16(<16 x float> %x225) {
; KNL-LABEL: fhsub_16:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vsubss %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhsub_16:
; SKX: # %bb.0:
-; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -192,13 +192,13 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fadd_noundef_eel:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_eel:
; SKX: # %bb.0:
-; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -215,14 +215,14 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fsub_noundef_ee:
; KNL: # %bb.0:
; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fsub_noundef_ee:
; SKX: # %bb.0:
; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0
-; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq