-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Expand shuffle testing with generated tests #123574
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
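@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Add some generated tests with every shuffle permutation for relevant vector element types and sizes. Not sure if this is going overboard with the number of tests. I pruned out the largest cases (16 and 32-bit cases are impractically large), and there's redundancy when testing the pointer cases (at least for SelectionDAG). This uses inline assembly to produce sample values because of how the ABI is lowered when using a function argument. Since we break all arguments into 32-bit pieces, a shuffle never ends up forming. We need separate handling to reconstruct shuffles in contexts involving physical registers in ABI contexts. I wrote a small tool to generate these, so I can easily change the exact test body. Not sure if it's worth posting anywhere. This is in preparation for making better use of v_pk_mov_b32, v_mov_b64 and s_mov_b64 in shuffles. Patch is 32.37 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123574.diff 81 Files Affected: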
diff --git a/llvm/test/CodeGen/AMDGPU/legal-shuffle.v2f32.ll b/llvm/test/CodeGen/AMDGPU/legal-shuffle.v2f32.ll
new file mode 100644
index 00000000000000..e6c155d71e9414
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/legal-shuffle.v2f32.ll
@@ -0,0 +1,567 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+
+define void @v_shuffle_v2f32_00(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_00:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 0, i32 0>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_01(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_01:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 0, i32 1>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_02(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_02:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_02:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 0, i32 2>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_03(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_03:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_03:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 0, i32 3>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_10(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_10:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v2
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_10:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 1, i32 0>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_11(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_11:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 1, i32 1>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_12(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_12:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[4:5]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 1, i32 2>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_13(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_13:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[4:5]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 1, i32 3>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_20(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_20:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v2
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_20:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 2, i32 0>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_21(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_21:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[4:5]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 2, i32 1>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_22(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_22:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 2, i32 2>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_23(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_23:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 2, i32 3>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_30(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_30:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: v_mov_b32_e32 v4, v2
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_30:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 3, i32 0>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_31(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_31:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[4:5]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 3, i32 1>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_32(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_32:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v2
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_32:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 3, i32 2>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_33(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_33:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 3, i32 3>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_uu(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_uu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> poison
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_0u(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_0u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 0, i32 poison>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_1u(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_1u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 1, i32 poison>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_2u(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_2u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 2, i32 poison>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_3u(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_3u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 3, i32 poison>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_u0(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_u0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v2f32_u0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 poison, i32 0>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_u1(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: v_shuffle_v2f32_u1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def v[2:3]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %val0 = call <2 x float> asm "; def $0", "=v"()
+ %val1 = call <2 x float> asm "; def $0", "=v"()
+ %shuffle = shufflevector <2 x float> %val0, <2 x float> %val1, <2 x i32> <i32 poison, i32 1>
+ store <2 x float> %shuffle, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_shuffle_v2f32_u2(ptr addrspace(1) %ptr) {
+; GFX900-LABEL: v_shuffle_v2f32_u2:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#A...
[truncated]
|
Add some generated tests with every shuffle permutation for relevant vector element types and sizes. Not sure if this is going overboard with the number of tests. I pruned out the largest cases (16 and 32-bit cases are impractically large), and there's redundancy when testing the pointer cases (at least for SelectionDAG). This uses inline assembly to produce sample values because of how the ABI is lowered when using a function argument. Since we break all arguments into 32-bit pieces, a shuffle never ends up forming. We need separate handling to reconstruct shuffles in contexts involving physical registers in ABI contexts. I wrote a small tool to generate these, so I can easily change the exact test body. Not sure if it's worth posting anywhere. This is in preparation for making better use of v_pk_mov_b32, v_mov_b64 and s_mov_b64 in shuffles.
a887dca
to
4f06cd9
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
> Not sure if it's worth posting anywhere.
Probably somewhere like llvm/utils/Target/AMDGPU?
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/17484 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/11776 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/11926 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/11782 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/20317 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/12331 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/16695 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/174/builds/11723 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/108/builds/8361 Here is the relevant piece of the build log for the reference
|
Test constraints passed the bot but now fail due to cc5eba1 |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/198/builds/1336 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/55/builds/5850 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/9913 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/95/builds/8623 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/35/builds/6444 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/7724 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/24/builds/4422 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/169/builds/7600 Here is the relevant piece of the build log for the reference
|
Should be fixed by 585858a |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/85/builds/4576 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/125/builds/5080 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/76/builds/6272 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/199/builds/1092 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/25/builds/5808 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/145/builds/4585 Here is the relevant piece of the build log for the reference
|
Add some generated tests with every shuffle permutation
for relevant vector element types and sizes. Not sure if this
is going overboard with the number of tests. I pruned out the largest
cases (16 and 32-bit cases are impractically large), and there's
redundancy when testing the pointer cases (at least for SelectionDAG).
This uses inline assembly to produce sample values because of how the
ABI is lowered when using a function argument. Since we break all
arguments into 32-bit pieces, a shuffle never ends up forming. We
need separate handling to reconstruct shuffles in contexts involving
physical registers in ABI contexts.
I wrote a small tool to generate these, so I can easily change the
exact test body. Not sure if it's worth posting anywhere.
This is in preparation for making better use of v_pk_mov_b32,
v_mov_b64 and s_mov_b64 in shuffles.