Skip to content

Commit bfd9bc2

Browse files
authored
[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#124131)
This PR reapplies the changes from PR #123942, which had to be reverted because of a test failure. The test has been adjusted.
1 parent e289cb5 commit bfd9bc2

20 files changed

+404
-112
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -962,8 +962,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
962962
const SIInstrInfo* TII) {
963963
// Check if this is already an SDWA instruction
964964
unsigned Opc = MI.getOpcode();
965-
if (TII->isSDWA(Opc))
966-
return true;
965+
if (TII->isSDWA(Opc)) {
966+
// FIXME: Reenable after fixing selection handling.
967+
// Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
968+
return false;
969+
}
967970

968971
// Check if this instruction has opcode that supports SDWA
969972
if (AMDGPU::getSDWAOp(Opc) == -1)

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
280280
; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
281281
; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
282282
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
283+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
283284
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285+
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
285286
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
286287
; GFX8-NEXT: s_setpc_b64 s[30:31]
287288
;
@@ -299,7 +300,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
299300
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
300301
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
301302
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
302-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
303+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
303305
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
304306
; GFX9-NEXT: s_setpc_b64 s[30:31]
305307
;
@@ -439,7 +441,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
439441
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
440442
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
441443
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
442-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
443446
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
444447
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
445448
; GFX9-NEXT: ; return to shader part epilog
@@ -609,9 +612,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
609612
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610613
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
611614
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
612-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
615+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
613617
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
615620
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616621
; GFX8-NEXT: s_setpc_b64 s[30:31]
617622
;

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
281281
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
282282
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
283283
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
284+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284285
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286+
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
286287
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
287288
; GFX8-NEXT: s_setpc_b64 s[30:31]
288289
;
@@ -300,7 +301,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
300301
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
301302
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
302303
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
303-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
304306
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
305307
; GFX9-NEXT: s_setpc_b64 s[30:31]
306308
;
@@ -440,7 +442,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
440442
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
441443
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
442444
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
443-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
446+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
444447
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
445448
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
446449
; GFX9-NEXT: ; return to shader part epilog
@@ -610,9 +613,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
610613
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
611614
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
612615
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
613-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
614618
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
615-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
620+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
616621
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
617622
; GFX8-NEXT: s_setpc_b64 s[30:31]
618623
;

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
224224
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
225225
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
226226
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
227-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
227+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
228+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
228229
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
229230
; GFX9-NEXT: s_setpc_b64 s[30:31]
230231
;
@@ -329,7 +330,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
329330
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
330331
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
331332
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
332-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
333+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
334+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
333335
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
334336
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
335337
; GFX9-NEXT: ; return to shader part epilog
@@ -451,9 +453,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
451453
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
452454
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
453455
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
454-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
456+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
457+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
455458
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
456-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
459+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
460+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
457461
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
458462
; GFX8-NEXT: s_setpc_b64 s[30:31]
459463
;
@@ -618,18 +622,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
618622
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
619623
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
620624
; GFX8-NEXT: v_mov_b32_e32 v2, s1
621-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
622625
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
623626
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
624-
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
625-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
627+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
626628
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
627629
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
628-
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
630+
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
631+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
629632
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
630-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
633+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
634+
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
635+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
631636
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
632-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
637+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
638+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
633639
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
634640
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
635641
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
218218
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
219219
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
220220
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
221-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
221+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
222+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
222223
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
223224
; GFX9-NEXT: s_setpc_b64 s[30:31]
224225
;
@@ -321,7 +322,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
321322
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
322323
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
323324
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
324-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
325+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
326+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
325327
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
326328
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
327329
; GFX9-NEXT: ; return to shader part epilog
@@ -439,9 +441,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
439441
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
440442
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
441443
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
442-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
444+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
445+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
443446
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
444-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
447+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
448+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
445449
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
446450
; GFX8-NEXT: s_setpc_b64 s[30:31]
447451
;
@@ -602,18 +606,20 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
602606
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
603607
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
604608
; GFX8-NEXT: v_mov_b32_e32 v2, s1
605-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
606609
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
607610
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
608-
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
609-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
611+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
610612
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
611613
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
612-
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
614+
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
615+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
613616
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618+
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
619+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
615620
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
621+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
622+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
617623
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618624
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
619625
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6398,8 +6398,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
63986398
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
63996399
; GFX8-NEXT: s_waitcnt vmcnt(0)
64006400
; GFX8-NEXT: v_mov_b32_e32 v5, v0
6401-
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
6401+
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
6402+
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
64026403
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
6404+
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
64036405
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
64046406
; GFX8-NEXT: v_mov_b32_e32 v0, v4
64056407
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -6625,8 +6627,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
66256627
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
66266628
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
66276629
; GFX8-NEXT: s_waitcnt vmcnt(0)
6628-
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
6630+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
6631+
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
66296632
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
6633+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
66306634
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
66316635
; GFX8-NEXT: v_mov_b32_e32 v5, v2
66326636
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7044,7 +7048,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
70447048
; GFX8-NEXT: ; =>This Loop Header: Depth=1
70457049
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
70467050
; GFX8-NEXT: s_waitcnt vmcnt(0)
7047-
; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
7051+
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
7052+
; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7053+
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
70487054
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
70497055
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
70507056
; GFX8-NEXT: v_mov_b32_e32 v6, v7
@@ -7390,8 +7396,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
73907396
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
73917397
; GFX8-NEXT: s_waitcnt vmcnt(0)
73927398
; GFX8-NEXT: v_mov_b32_e32 v5, v0
7393-
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
7399+
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
7400+
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
73947401
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
7402+
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
73957403
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
73967404
; GFX8-NEXT: v_mov_b32_e32 v0, v4
73977405
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -7650,8 +7658,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
76507658
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
76517659
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
76527660
; GFX8-NEXT: s_waitcnt vmcnt(0)
7653-
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
7661+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
7662+
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
76547663
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
7664+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
76557665
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
76567666
; GFX8-NEXT: v_mov_b32_e32 v5, v2
76577667
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7915,8 +7925,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
79157925
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
79167926
; GFX8-NEXT: s_waitcnt vmcnt(0)
79177927
; GFX8-NEXT: v_mov_b32_e32 v5, v0
7918-
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
7928+
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
7929+
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
79197930
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
7931+
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
79207932
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
79217933
; GFX8-NEXT: v_mov_b32_e32 v0, v4
79227934
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -8175,8 +8187,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
81758187
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
81768188
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
81778189
; GFX8-NEXT: s_waitcnt vmcnt(0)
8178-
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8190+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
8191+
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
81798192
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
8193+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
81808194
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
81818195
; GFX8-NEXT: v_mov_b32_e32 v5, v2
81828196
; GFX8-NEXT: v_mov_b32_e32 v4, v1

0 commit comments

Comments
 (0)