Skip to content

Commit e3e7c75

Browse files
authored
AMDGPU: Update pattern matching from "x&(-1>>(32-y))" to "bfe x, 0, y" (#116115)
It is not correct to lower "x&(-1>>(32-y))" to "bfe x, 0, y" unconditionally. When y equals 32, "-1" is not shifted at all, so "x&(-1>>(32-32))" is still x, but "bfe x, 0, 32" is 0. However, if y is known to fit in 5 bits (i.e. y < 32), the two forms agree and the pattern can still be matched.
1 parent 9f96f1c commit e3e7c75

File tree

4 files changed

+47
-9
lines changed

4 files changed

+47
-9
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#include "SIISelLowering.h"
2323
#include "SIMachineFunctionInfo.h"
2424
#include "llvm/Analysis/UniformityAnalysis.h"
25-
#include "llvm/Analysis/ValueTracking.h"
2625
#include "llvm/CodeGen/FunctionLoweringInfo.h"
2726
#include "llvm/CodeGen/SelectionDAG.h"
2827
#include "llvm/CodeGen/SelectionDAGISel.h"

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "GCNSubtarget.h"
1818
#include "SIMachineFunctionInfo.h"
1919
#include "SIModeRegisterDefaults.h"
20+
#include "llvm/Analysis/ValueTracking.h"
2021
#include "llvm/CodeGen/SelectionDAGISel.h"
2122
#include "llvm/Target/TargetMachine.h"
2223

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3550,9 +3550,13 @@ def : AMDGPUPat <
35503550
(V_BFE_U32_e64 $src, (i32 0), $width)
35513551
>;
35523552

3553+
def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
3554+
return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
3555+
}]>;
3556+
35533557
// x & (-1 >> (bitwidth - y))
35543558
def : AMDGPUPat <
3555-
(DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
3559+
(DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, uint5Bits:$width))),
35563560
(V_BFE_U32_e64 $src, (i32 0), $width)
35573561
>;
35583562

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,36 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
9999
; ---------------------------------------------------------------------------- ;
100100

101101
define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
102-
; GCN-LABEL: bzhi32_c0:
102+
; SI-LABEL: bzhi32_c0:
103+
; SI: ; %bb.0:
104+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105+
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
106+
; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
107+
; SI-NEXT: v_and_b32_e32 v0, v1, v0
108+
; SI-NEXT: s_setpc_b64 s[30:31]
109+
;
110+
; VI-LABEL: bzhi32_c0:
111+
; VI: ; %bb.0:
112+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113+
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
114+
; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
115+
; VI-NEXT: v_and_b32_e32 v0, v1, v0
116+
; VI-NEXT: s_setpc_b64 s[30:31]
117+
%numhighbits = sub i32 32, %numlowbits
118+
%mask = lshr i32 -1, %numhighbits
119+
%masked = and i32 %mask, %val
120+
ret i32 %masked
121+
}
122+
123+
define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
124+
; GCN-LABEL: bzhi32_c0_clamp:
103125
; GCN: ; %bb.0:
104126
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127+
; GCN-NEXT: v_and_b32_e32 v1, 31, v1
105128
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
106129
; GCN-NEXT: s_setpc_b64 s[30:31]
107-
%numhighbits = sub i32 32, %numlowbits
130+
%low5bits = and i32 %numlowbits, 31
131+
%numhighbits = sub i32 32, %low5bits
108132
%mask = lshr i32 -1, %numhighbits
109133
%masked = and i32 %mask, %val
110134
ret i32 %masked
@@ -134,11 +158,21 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
134158
}
135159

136160
define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
137-
; GCN-LABEL: bzhi32_c4_commutative:
138-
; GCN: ; %bb.0:
139-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140-
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
141-
; GCN-NEXT: s_setpc_b64 s[30:31]
161+
; SI-LABEL: bzhi32_c4_commutative:
162+
; SI: ; %bb.0:
163+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
165+
; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
166+
; SI-NEXT: v_and_b32_e32 v0, v0, v1
167+
; SI-NEXT: s_setpc_b64 s[30:31]
168+
;
169+
; VI-LABEL: bzhi32_c4_commutative:
170+
; VI: ; %bb.0:
171+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172+
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
173+
; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
174+
; VI-NEXT: v_and_b32_e32 v0, v0, v1
175+
; VI-NEXT: s_setpc_b64 s[30:31]
142176
%numhighbits = sub i32 32, %numlowbits
143177
%mask = lshr i32 -1, %numhighbits
144178
%masked = and i32 %val, %mask ; swapped order

0 commit comments

Comments
 (0)