AMDGPU: Update pattern matching from "x&(-1>>(32-y))" to "bfe x, 0, y" (#116115)

changpeng · web-flow · commit e3e7c756fb43 · 2024-11-14T12:21:34.000-08:00
It is not correct to lower "x&amp;(-1&gt;&gt;(32-y))" to "bfe x, 0, y". When y
equals 32, "-1" is not shifted, so x&amp;(-1&gt;&gt;(32-32) is still x, but "bfe
x, 0, 32" is 0. However, if we know y is at most of 5 bits (&lt; 32), we
can still do the pattern matching.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,7 +22,6 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetMachine.h"
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3550,9 +3550,13 @@ def : AMDGPUPat <
   (V_BFE_U32_e64 $src, (i32 0), $width)
 >;
 
+def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
+  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
+}]>;
+
 // x & (-1 >> (bitwidth - y))
 def : AMDGPUPat <
-  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, uint5Bits:$width))),
   (V_BFE_U32_e64 $src, (i32 0), $width)
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -99,12 +99,36 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; ---------------------------------------------------------------------------- ;
 
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0:
+; SI-LABEL: bzhi32_c0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
+; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
+; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %numhighbits = sub i32 32, %numlowbits
+  %mask = lshr i32 -1, %numhighbits
+  %masked = and i32 %mask, %val
+  ret i32 %masked
+}
+
+define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
+; GCN-LABEL: bzhi32_c0_clamp:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %numhighbits = sub i32 32, %numlowbits
+  %low5bits = and i32 %numlowbits, 31
+  %numhighbits = sub i32 32, %low5bits
   %mask = lshr i32 -1, %numhighbits
   %masked = and i32 %mask, %val
   ret i32 %masked
@@ -134,11 +158,21 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 }
 
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c4_commutative:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c4_commutative:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
+; SI-NEXT:    v_and_b32_e32 v0, v0, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c4_commutative:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
+; VI-NEXT:    v_and_b32_e32 v0, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
   %masked = and i32 %val, %mask ; swapped order