aarch64: Emit XAR for vector rotates where possible
We can make use of the integrated rotate step of the XAR instruction
to implement most vector integer rotates, as long as we zero out one
of the input registers for it.  This allows for a lower-latency sequence
than the fallback SHL+USRA, especially when we can hoist the zeroing operation
away from loops and hot parts.  This should be safe to do for 64-bit vectors
as well even though the XAR instructions operate on 128-bit values, as the
bottom 64-bit result is later accessed through the right subregs.
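
For reference, XAR XORs its two source operands and then rotates each element right by an immediate, so with one operand zeroed the XOR is a no-op and the instruction performs a plain rotate: a rotate left by N on W-bit elements uses an XAR immediate of W - N. A minimal scalar sketch of that idea (the function name and the fixed 32-bit width are illustrative, not part of the patch):

/* Hypothetical scalar model of XAR on one 32-bit element;
   IMM is assumed to be in the range 1..31.  */
static inline unsigned int
xar32_model (unsigned int a, unsigned int b, unsigned int imm)
{
  unsigned int x = a ^ b;
  return (x >> imm) | (x << (32 - imm));  /* rotate right by IMM */
}

With b == 0, xar32_model (r, 0, 23) is the rotate left by 9 from the G1 example below, which is why the generated xar uses the #23 immediate.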

This strategy is used whenever we have XAR instructions; the logic
in aarch64_emit_opt_vec_rotate is adjusted to resort to
expand_rotate_as_vec_perm only when it is expected to generate a single REV*
instruction or when XAR instructions are not present.
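
For instance (an assumed illustration; the typedef and function name are not from the patch), a rotate by half the element width just swaps the two halves of each element, which expand_rotate_as_vec_perm can emit as a single REV*-style permute, so the XAR path is not needed there:

typedef unsigned int __attribute__ ((vector_size (16))) v4si_u;

v4si_u
rot_half (v4si_u r)
{
  /* Rotate each 32-bit element by 16 bits; expected to map to a single
     REV* instruction rather than an XAR sequence.  */
  return (r >> 16) | (r << 16);
}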

With this patch, for the input:
v4si
G1 (v4si r)
{
    return (r >> 23) | (r << 9);
}

v8qi
G2 (v8qi r)
{
  return (r << 3) | (r >> 5);
}
we generate the following assembly for +sve2:
G1:
        movi    v31.4s, 0
        xar     z0.s, z0.s, z31.s, #23
        ret

G2:
        movi    v31.4s, 0
        xar     z0.b, z0.b, z31.b, #5
        ret

instead of the current:
G1:
        shl     v31.4s, v0.4s, 9
        usra    v31.4s, v0.4s, 23
        mov     v0.16b, v31.16b
        ret
G2:
        shl     v31.8b, v0.8b, 3
        usra    v31.8b, v0.8b, 5
        mov     v0.8b, v31.8b
        ret

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Add
	generation of XAR sequences when possible.

gcc/testsuite/

	* gcc.target/aarch64/rotate_xar_1.c: New test.
ktkachov committed Nov 4, 2024
1 parent 19757e1 commit 14cb23e
Showing 2 changed files with 121 additions and 6 deletions.
34 changes: 28 additions & 6 deletions gcc/config/aarch64/aarch64.cc
@@ -16019,17 +16019,39 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
 }
 
 /* Emit an optimized sequence to perform a vector rotate
-   of REG by the vector constant amount AMNT and place the result
+   of REG by the vector constant amount AMNT_VEC and place the result
    in DST. Return true iff successful. */
 
 bool
-aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
 {
+  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
-  /* Attempt to expand the rotate as a vector permute.
-     For some rotate amounts they can be single instructions and
-     even the general single-vector TBL permute has good throughput. */
-  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+  /* Rotates by half the element width map down to REV* instructions and should
+     always be preferred when possible. */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
+      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  /* 64 and 128-bit vector modes can use the XAR instruction
+     when available. */
+  else if (can_create_pseudo_p ()
+           && ((TARGET_SHA3 && mode == V2DImode)
+               || (TARGET_SVE2
+                   && (known_eq (GET_MODE_SIZE (mode), 8)
+                       || known_eq (GET_MODE_SIZE (mode), 16)))))
+    {
+      rtx zeroes = aarch64_gen_shareable_zero (mode);
+      rtx xar_op
+        = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
+                          amnt_vec);
+      emit_set_insn (dst, xar_op);
+      return true;
+    }
+  /* If none of the above, try to expand rotates by any byte amount as
+     permutes. */
+  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
     return true;
   return false;
 }
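
Since aarch64_gen_shareable_zero produces a zero operand that does not depend on the rotated value, the movi that materializes it is loop-invariant; a sketch of the kind of loop the commit message alludes to (hypothetical function, assuming it gets vectorized and +sve2 or +sha3 is enabled):

void
rot_loop (unsigned int *restrict x, int n)
{
  for (int i = 0; i < n; i++)
    /* Per-element rotate left by 9; once vectorized, the single zeroing
       movi can be hoisted out of the loop while one xar per vector
       remains inside it.  */
    x[i] = (x[i] >> 23) | (x[i] << 9);
}
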
93 changes: 93 additions & 0 deletions gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c
@@ -0,0 +1,93 @@
/* { dg-do compile } */
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

#pragma GCC target "+sve2+sha3"

/*
** G1:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar v0\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 39
** ret
*/
v2di
G1 (v2di r) {
return (r >> 39) | (r << 25);
}

/*
** G2:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #23
** ret
*/
v4si
G2 (v4si r) {
return (r >> 23) | (r << 9);
}

/*
** G3:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #5
** ret
*/
v8hi
G3 (v8hi r) {
return (r >> 5) | (r << 11);
}

/*
** G4:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #6
** ret
*/
v16qi
G4 (v16qi r)
{
return (r << 2) | (r >> 6);
}

/*
** G5:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #22
** ret
*/
v2si
G5 (v2si r) {
return (r >> 22) | (r << 10);
}

/*
** G6:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #7
** ret
*/
v4hi
G6 (v4hi r) {
return (r >> 7) | (r << 9);
}

/*
** G7:
** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #5
** ret
*/
v8qi
G7 (v8qi r)
{
return (r << 3) | (r >> 5);
}
