-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64][SVE2] SVE2 NBSL instruction lowering. #89732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Dinar Temirbulatov (dtemirbulatov) ChangesAllow to fold BSL/CNOT instuctions to NBSL instruction for scalable vectors. Full diff: https://github.com/llvm/llvm-project/pull/89732.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 6972acd985cb9a..e291d8857bdcd5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3750,6 +3750,23 @@ let Predicates = [HasSVE2orSME] in {
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
+
+ // zext(cmpeq(bsl(x, y, z), splat(0))) -> nbsl(x, y, z)
+ def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive)),
+ (nxv16i8 (AArch64bsp nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), (SVEDup0), SETEQ)))),
+ (NBSL_ZZZZ nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)>;
+
+ def : Pat<(nxv8i16 (zext (nxv8i1 (AArch64setcc_z (nxv8i1 (SVEAllActive)),
+ (nxv8i16 (AArch64bsp nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)), (SVEDup0), SETEQ)))),
+ (NBSL_ZZZZ nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)>;
+
+ def : Pat<(nxv4i32 (zext (nxv4i1 (AArch64setcc_z (nxv4i1 (SVEAllActive)),
+ (nxv4i32 (AArch64bsp nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)), (SVEDup0), SETEQ)))),
+ (NBSL_ZZZZ nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)>;
+
+ def : Pat<(nxv2i64 (zext (nxv2i1 (AArch64setcc_z (nxv2i1 (SVEAllActive)),
+ (nxv2i64 (AArch64bsp nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)), (SVEDup0), SETEQ)))),
+ (NBSL_ZZZZ nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)>;
} // End HasSVE2orSME
let Predicates = [HasSVE2] in {
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 23b2622f5f5863..a7edd944e399fe 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -41,3 +41,18 @@ define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32>
%c = or <vscale x 4 x i32> %1, %2
ret <vscale x 4 x i32> %c
}
+
+define <vscale x 4 x i32> @nbsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: nbsl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.s, #0x7fffffff
+; CHECK-NEXT: nbsl z2.d, z2.d, z0.d, z1.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+ %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %3 = or <vscale x 4 x i32> %1, %2
+ %4 = icmp eq <vscale x 4 x i32> %3, zeroinitializer
+ %5 = zext <vscale x 4 x i1> %4 to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %5
+}
|
@@ -41,3 +41,18 @@ define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> | |||
%c = or <vscale x 4 x i32> %1, %2 | |||
ret <vscale x 4 x i32> %c | |||
} | |||
|
|||
define <vscale x 4 x i32> @nbsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is probably worth having a test per type-size, if the patterns are different.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
// zext(cmpeq(bsl(x, y, z), splat(0))) -> nbsl(x, y, z) | ||
def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive)), | ||
(nxv16i8 (AArch64bsp nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), (SVEDup0), SETEQ)))), | ||
(NBSL_ZZZZ nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe the operands need to be Op2, Op3, Op1. The order of the operands is weird.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -3750,6 +3750,23 @@ let Predicates = [HasSVE2orSME] in { | |||
|
|||
// SVE2 extract vector (immediate offset, constructive) | |||
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; | |||
|
|||
// zext(cmpeq(bsl(x, y, z), splat(0))) -> nbsl(x, y, z) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The instruction looks like it should be not(or(and(x, z), and(y, not(z)))
, not the same as a "cnot
" instruction. I think you can use vnot in the tablegen pattern, and the ir will be xor "bsl", -1
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It may be possible to pass a pattern fragment to sve2_int_bitwise_ternary_op, like bsl already does.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. LGTM
@@ -733,6 +733,8 @@ def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>; | |||
def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; | |||
|
|||
def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; | |||
def AArch64nbsl: PatFrag<(ops node:$Op1, node:$Op2, node:$Op3), | |||
(vnot (AArch64bsp node:$Op1, node:$Op2, node:$Op3))>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add an extra space, to line this up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Allow to fold BSL/CNOT instuctions to NBSL instruction for scalable vectors.
Allow to fold BSL/EOR instuctions to NBSL instruction for scalable vectors.