
Commit 14cb23e

aarch64: Emit XAR for vector rotates where possible
We can make use of the integrated rotate step of the XAR instruction to
implement most vector integer rotates, as long as we zero out one of the
input registers for it.  This allows for a lower-latency sequence than the
fallback SHL+USRA, especially when we can hoist the zeroing operation away
from loops and hot parts.  This should be safe to do for 64-bit vectors as
well, even though the XAR instructions operate on 128-bit values, as the
bottom 64-bit result is later accessed through the right subregs.

This strategy is used whenever we have XAR instructions; the logic in
aarch64_emit_opt_vec_rotate is adjusted to resort to
expand_rotate_as_vec_perm only when it's expected to generate a single
REV* instruction or when XAR instructions are not present.

With this patch we can generate for the input:

v4si
G1 (v4si r)
{
  return (r >> 23) | (r << 9);
}

v8qi
G2 (v8qi r)
{
  return (r << 3) | (r >> 5);
}

the assembly for +sve2:

G1:
	movi	v31.4s, 0
	xar	z0.s, z0.s, z31.s, #23
	ret

G2:
	movi	v31.4s, 0
	xar	z0.b, z0.b, z31.b, #5
	ret

instead of the current:

G1:
	shl	v31.4s, v0.4s, 9
	usra	v31.4s, v0.4s, 23
	mov	v0.16b, v31.16b
	ret

G2:
	shl	v31.8b, v0.8b, 3
	usra	v31.8b, v0.8b, 5
	mov	v0.8b, v31.8b
	ret

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Add
	generation of XAR sequences when possible.

gcc/testsuite/

	* gcc.target/aarch64/rotate_xar_1.c: New test.
1 parent 19757e1 commit 14cb23e
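
The identity the patch relies on can be illustrated with ACLE NEON intrinsics: XAR rotates the exclusive-OR of its two operands right by an immediate, so pairing it with an all-zero register leaves a plain rotate.  The sketch below is not part of the commit; the function names are illustrative and it assumes a toolchain where the +sha3 NEON intrinsic vxarq_u64 is available (e.g. building with -march=armv8.2-a+sha3).

#include <arm_neon.h>

/* Rotate each 64-bit lane of R right by 39, written with shifts.
   This is the shape the compiler recognises as a rotate.  */
uint64x2_t
rotr39_shifts (uint64x2_t r)
{
  return vorrq_u64 (vshrq_n_u64 (r, 39), vshlq_n_u64 (r, 25));
}

/* The same rotate spelled with the XAR intrinsic: (r ^ 0) rotated
   right by 39 is just a rotate of r, which is what the patch emits
   after materialising a zero register.  */
uint64x2_t
rotr39_xar (uint64x2_t r)
{
  return vxarq_u64 (r, vdupq_n_u64 (0), 39);
}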

File tree

gcc/config/aarch64/aarch64.cc
gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c

2 files changed: +121, -6 lines


gcc/config/aarch64/aarch64.cc

Lines changed: 28 additions & 6 deletions
@@ -16019,17 +16019,39 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
 }
 
 /* Emit an optimized sequence to perform a vector rotate
-   of REG by the vector constant amount AMNT and place the result
+   of REG by the vector constant amount AMNT_VEC and place the result
    in DST.  Return true iff successful.  */
 
 bool
-aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
 {
+  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
-  /* Attempt to expand the rotate as a vector permute.
-     For some rotate amounts they can be single instructions and
-     even the general single-vector TBL permute has good throughput.  */
-  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+  /* Rotates by half the element width map down to REV* instructions and should
+     always be preferred when possible.  */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
+      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  /* 64 and 128-bit vector modes can use the XAR instruction
+     when available.  */
+  else if (can_create_pseudo_p ()
+	   && ((TARGET_SHA3 && mode == V2DImode)
+	       || (TARGET_SVE2
+		   && (known_eq (GET_MODE_SIZE (mode), 8)
+		       || known_eq (GET_MODE_SIZE (mode), 16)))))
+    {
+      rtx zeroes = aarch64_gen_shareable_zero (mode);
+      rtx xar_op
+	= gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
+			  amnt_vec);
+      emit_set_insn (dst, xar_op);
+      return true;
+    }
+  /* If none of the above, try to expand rotates by any byte amount as
+     permutes.  */
+  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
     return true;
   return false;
 }
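
For context on the first branch in the new code: a rotate by half the element width is still routed through expand_rotate_as_vec_perm, since it is expected to map to a single REV*-style permute rather than the XAR sequence.  A minimal example of that case (illustrative only, not from the commit or its testsuite), compiled at -O2:

typedef unsigned int __attribute__ ((vector_size (16))) v4si;

/* 32-bit lanes rotated by 16, i.e. half the element width: this is
   the case the new code hands to the vector-permute path instead of
   the XAR path.  */
v4si
rot_half (v4si r)
{
  return (r >> 16) | (r << 16);
}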
gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+#pragma GCC target "+sve2+sha3"
+
+/*
+** G1:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	v0\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 39
+**	ret
+*/
+v2di
+G1 (v2di r) {
+    return (r >> 39) | (r << 25);
+}
+
+/*
+** G2:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.s, z[0-9]+\.s, z[0-9]+\.s, #23
+**	ret
+*/
+v4si
+G2 (v4si r) {
+    return (r >> 23) | (r << 9);
+}
+
+/*
+** G3:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.h, z[0-9]+\.h, z[0-9]+\.h, #5
+**	ret
+*/
+v8hi
+G3 (v8hi r) {
+    return (r >> 5) | (r << 11);
+}
+
+/*
+** G4:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.b, z[0-9]+\.b, z[0-9]+\.b, #6
+**	ret
+*/
+v16qi
+G4 (v16qi r)
+{
+  return (r << 2) | (r >> 6);
+}
+
+/*
+** G5:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.s, z[0-9]+\.s, z[0-9]+\.s, #22
+**	ret
+*/
+v2si
+G5 (v2si r) {
+    return (r >> 22) | (r << 10);
+}
+
+/*
+** G6:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.h, z[0-9]+\.h, z[0-9]+\.h, #7
+**	ret
+*/
+v4hi
+G6 (v4hi r) {
+    return (r >> 7) | (r << 9);
+}
+
+/*
+** G7:
+**	movi?	[vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**	xar	z0\.b, z[0-9]+\.b, z[0-9]+\.b, #5
+**	ret
+*/
+v8qi
+G7 (v8qi r)
+{
+  return (r << 3) | (r >> 5);
+}
+
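
One detail worth spelling out about the patterns above: XAR encodes a right-rotate amount, so for a source-level left rotate the expected immediate is the element width minus the rotate count (e.g. G4 rotates 8-bit lanes left by 2 and the test checks for #6).  A small illustrative helper capturing that mapping, not part of the test itself:

/* Right-rotate immediate corresponding to a left rotate of
   LEFT_ROTATE bits on ELT_BITS-wide elements.  */
static inline unsigned
xar_immediate (unsigned elt_bits, unsigned left_rotate)
{
  return (elt_bits - left_rotate) % elt_bits;
}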
