Skip to content

Commit

Permalink
[AArch64][SVE] Detect MOV (imm, pred, zeroing/merging) (#116032)
Browse files Browse the repository at this point in the history
Add patterns to fold MOV (scalar, predicated) to MOV (imm, pred,
merging) or MOV (imm, pred, zeroing) as appropriate.

This affects the `@llvm.aarch64.sve.dup` intrinsics, which currently
generate MOV (scalar, predicated) instructions even when the
immediate forms are possible. For example:
```
svuint8_t mov_z_b(svbool_t p) {
  return svdup_u8_z(p, 1);
}
```
Currently generates:
```
mov_z_b(__SVBool_t):
        mov     z0.b, #0
        mov     w8, #1
        mov     z0.b, p0/m, w8
        ret
```
Instead of the single immediate-form instruction:
```
mov_z_b(__SVBool_t):
        mov     z0.b, p0/z, #1
        ret
```
  • Loading branch information
rj-jesus authored Nov 15, 2024
1 parent 4163136 commit e543650
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 4 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -839,8 +839,8 @@ let Predicates = [HasSVEorSME] in {
defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;

// Splat immediate (predicated)
defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy", AArch64dup_mt>;
defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy", AArch64dup_mt>;
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;

// Splat scalar register (unpredicated, GPR or vector + element index)
Expand Down
22 changes: 20 additions & 2 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -5357,7 +5357,7 @@ multiclass sve_int_dup_imm_pred_merge_inst<
(!cast<Instruction>(NAME) $Zd, $Pg, $imm, $shift)>;
}

multiclass sve_int_dup_imm_pred_merge<string asm> {
multiclass sve_int_dup_imm_pred_merge<string asm, SDPatternOperator op> {
defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
Expand Down Expand Up @@ -5386,6 +5386,15 @@ multiclass sve_int_dup_imm_pred_merge<string asm> {
(!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>;
def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f64 ZPR:$Zd)),
(!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>;

def : Pat<(nxv16i8 (op nxv16i1:$pg, (i32 (SVECpyDupImm8Pat i32:$a, i32:$b)), nxv16i8:$zd)),
(!cast<Instruction>(NAME # _B) $zd, $pg, $a, $b)>;
def : Pat<(nxv8i16 (op nxv8i1:$pg, (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)), nxv8i16:$zd)),
(!cast<Instruction>(NAME # _H) $zd, $pg, $a, $b)>;
def : Pat<(nxv4i32 (op nxv4i1:$pg, (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)), nxv4i32:$zd)),
(!cast<Instruction>(NAME # _S) $zd, $pg, $a, $b)>;
def : Pat<(nxv2i64 (op nxv2i1:$pg, (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)), nxv2i64:$zd)),
(!cast<Instruction>(NAME # _D) $zd, $pg, $a, $b)>;
}

multiclass sve_int_dup_imm_pred_zero_inst<
Expand All @@ -5407,7 +5416,7 @@ multiclass sve_int_dup_imm_pred_zero_inst<
(!cast<Instruction>(NAME) $Pg, $imm, $shift)>;
}

multiclass sve_int_dup_imm_pred_zero<string asm> {
multiclass sve_int_dup_imm_pred_zero<string asm, SDPatternOperator op> {
defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
Expand All @@ -5416,6 +5425,15 @@ multiclass sve_int_dup_imm_pred_zero<string asm> {
nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>;
defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64,
nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;

def : Pat<(nxv16i8 (op nxv16i1:$pg, (i32 (SVECpyDupImm8Pat i32:$a, i32:$b)), (SVEDup0))),
(!cast<Instruction>(NAME # _B) $pg, $a, $b)>;
def : Pat<(nxv8i16 (op nxv8i1:$pg, (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)), (SVEDup0))),
(!cast<Instruction>(NAME # _H) $pg, $a, $b)>;
def : Pat<(nxv4i32 (op nxv4i1:$pg, (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)), (SVEDup0))),
(!cast<Instruction>(NAME # _S) $pg, $a, $b)>;
def : Pat<(nxv2i64 (op nxv2i1:$pg, (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)), (SVEDup0))),
(!cast<Instruction>(NAME # _D) $pg, $a, $b)>;
}

//===----------------------------------------------------------------------===//
Expand Down
83 changes: 83 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-mov-imm-pred.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; Zeroing.

; Predicated dup of the constant 1 into i8 elements, with zeroinitializer as
; the first operand of the intrinsic. The expected output is the single
; zeroing immediate form (mov z0.b, p0/z, #1).
define <vscale x 16 x i8> @mov_z_b(<vscale x 16 x i1> %pg) {
; CHECK-LABEL: mov_z_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, i8 1)
ret <vscale x 16 x i8> %r
}

; Predicated dup of the constant 1 into i16 elements, with zeroinitializer as
; the first operand of the intrinsic. The expected output is the single
; zeroing immediate form (mov z0.h, p0/z, #1).
define <vscale x 8 x i16> @mov_z_h(<vscale x 8 x i1> %pg) {
; CHECK-LABEL: mov_z_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, i16 1)
ret <vscale x 8 x i16> %r
}

; Predicated dup of the constant 1 into i32 elements, with zeroinitializer as
; the first operand of the intrinsic. The expected output is the single
; zeroing immediate form (mov z0.s, p0/z, #1).
define <vscale x 4 x i32> @mov_z_s(<vscale x 4 x i1> %pg) {
; CHECK-LABEL: mov_z_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 1)
ret <vscale x 4 x i32> %r
}

; Predicated dup of the constant 1 into i64 elements, with zeroinitializer as
; the first operand of the intrinsic. The expected output is the single
; zeroing immediate form (mov z0.d, p0/z, #1).
define <vscale x 2 x i64> @mov_z_d(<vscale x 2 x i1> %pg) {
; CHECK-LABEL: mov_z_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 1)
ret <vscale x 2 x i64> %r
}

; Merging.

; Predicated dup of the constant 1 into i8 elements, with the incoming vector
; %zd as the first operand of the intrinsic. The expected output is the single
; merging immediate form (mov z0.b, p0/m, #1).
define <vscale x 16 x i8> @mov_m_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
; CHECK-LABEL: mov_m_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.b, p0/m, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i8 1)
ret <vscale x 16 x i8> %r
}

; Predicated dup of the constant 1 into i16 elements, with the incoming vector
; %zd as the first operand of the intrinsic. The expected output is the single
; merging immediate form (mov z0.h, p0/m, #1).
define <vscale x 8 x i16> @mov_m_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg) {
; CHECK-LABEL: mov_m_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, p0/m, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i16 1)
ret <vscale x 8 x i16> %r
}

; Predicated dup of the constant 1 into i32 elements, with the incoming vector
; %zd as the first operand of the intrinsic. The expected output is the single
; merging immediate form (mov z0.s, p0/m, #1).
define <vscale x 4 x i32> @mov_m_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg) {
; CHECK-LABEL: mov_m_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, p0/m, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 1)
ret <vscale x 4 x i32> %r
}

; Predicated dup of the constant 1 into i64 elements, with the incoming vector
; %zd as the first operand of the intrinsic. The expected output is the single
; merging immediate form (mov z0.d, p0/m, #1).
define <vscale x 2 x i64> @mov_m_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg) {
; CHECK-LABEL: mov_m_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/m, #1 // =0x1
; CHECK-NEXT: ret
%r = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i64 1)
ret <vscale x 2 x i64> %r
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)

0 comments on commit e543650

Please sign in to comment.