diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll deleted file mode 100644 index a7539ac3cce80..0000000000000 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll +++ /dev/null @@ -1,115 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s -; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s - -define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB0_2 -; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4: -; LSE: // %bb.0: -; LSE-NEXT: fcvtl v1.4s, v0.4h -; LSE-NEXT: ldr 
s0, [x0] -; LSE-NEXT: .LBB0_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fcvtl v2.4s, v0.4h -; LSE-NEXT: fmov w8, s0 -; LSE-NEXT: mov w10, w8 -; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; LSE-NEXT: fcvtn v2.4h, v2.4s -; LSE-NEXT: fmov w9, s2 -; LSE-NEXT: casal w10, w9, [x0] -; LSE-NEXT: fmov s0, w10 -; LSE-NEXT: cmp w10, w8 -; LSE-NEXT: b.ne .LBB0_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4 - ret <2 x half> %res -} - -define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 { -; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB1_2 -; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 -; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; NOLSE-NEXT: fmov d0, d1 -; NOLSE-NEXT: ret -; -; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8: -; LSE: // %bb.0: -; LSE-NEXT: ldr d1, [x0] -; LSE-NEXT: .LBB1_1: // %atomicrmw.start -; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; LSE-NEXT: fmov x8, d1 -; LSE-NEXT: 
mov x10, x8 -; LSE-NEXT: fmov x9, d2 -; LSE-NEXT: casal x10, x9, [x0] -; LSE-NEXT: fmov d1, x10 -; LSE-NEXT: cmp x10, x8 -; LSE-NEXT: b.ne .LBB1_1 -; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: fmov d0, d1 -; LSE-NEXT: ret - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8 - ret <2 x float> %res -} - -attributes #0 = { nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll new file mode 100644 index 0000000000000..89c9880ffc786 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This 
Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { 
+; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 
// 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fadd s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fadd s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fadd s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fadd_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fadd_f64_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 
+; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fadd d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __addtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __addtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __addtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fadd v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __addsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fadd v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __adddf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fadd ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll new file mode 100644 index 0000000000000..998d8ae0c1de4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assigns stack slots to different offsets for some reason. 
+; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, 
#-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; 
NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmaxnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmaxnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fmaxnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmax_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; 
NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fmaxnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fmaxl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fmaxl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fmaxl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmaxnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmaxnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fmaxf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmax +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmax ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll new file mode 100644 index 0000000000000..2697dbf5b2191 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +; FIXME: Windows hosts assign stack slots to different offsets for some reason. +; UNSUPPORTED: system-windows + +define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: 
def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmin_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, half %value seq_cst, 
align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; 
LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, 
w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fminnm s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: 
fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fminnm s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: 
// in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fminnm s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne 
.LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fminnm d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl fminl +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, #16] 
// 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl fminl +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl fminl +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; 
SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s1, h1 +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fcvt s4, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fcvt s3, h3 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h3, s3 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB7_3: // 
%atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: fcvt s1, h1 +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fcvt s4, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fcvt s3, h3 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fcvt h4, s4 +; LSE-NEXT: fcvt h3, s3 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] 
+; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NOLSE-NEXT: mov h1, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: mov h3, v0.h[1] +; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: lsl w10, w10, #16 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s4, w10 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: ubfx w12, w10, #16, #1 +; NOLSE-NEXT: add w10, w10, w8 +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: add w10, w12, w10 +; NOLSE-NEXT: lsr w10, w10, #16 +; NOLSE-NEXT: ubfx w11, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: fmov s4, w10 +; 
NOLSE-NEXT: add w9, w11, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s3, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov v4.h[1], v3.h[0] +; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w11, [x0] +; NOLSE-NEXT: cmp w11, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 +; LSE-NEXT: mov h1, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s1 +; LSE-NEXT: fmov s2, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov h3, v0.h[1] +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsl w10, w10, #16 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fminnm s4, s4, s2 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fminnm s3, s3, s1 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: ubfx w12, w10, #16, #1 +; LSE-NEXT: add w10, w10, w8 +; LSE-NEXT: fmov w9, s3 +; LSE-NEXT: add w10, w12, w10 +; LSE-NEXT: lsr w10, w10, #16 +; LSE-NEXT: ubfx w11, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: fmov s4, w10 +; LSE-NEXT: add w9, w11, w9 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov v4.h[1], v3.h[0] +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: fmov w10, s4 +; LSE-NEXT: casal w11, w10, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w9 +; LSE-NEXT: b.ne 
.LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal 
x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl fminf +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: 
cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] 
+; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fminnm v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov 
x1, x21 +; SOFTFP-NOLSE-NEXT: bl fmin +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fmin ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll new file mode 100644 index 0000000000000..f41ddcb81d5ca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -0,0 +1,1209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=NOLSE %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefix=LSE %s +; RUN: llc 
-mtriple=aarch64-linux-gnu -mattr=-lse,-fp-armv8 -O1 < %s | FileCheck -check-prefix=SOFTFP-NOLSE %s + +define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB0_2 +; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB0_5 +; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB0_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB0_3 +; NOLSE-NEXT: b .LBB0_1 +; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB0_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB0_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %res +} + +define half 
@test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvt s1, h0 +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: b .LBB1_2 +; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.eq .LBB1_5 +; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: fcvt s2, h0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fcvt h2, s2 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w10, [x0] +; NOLSE-NEXT: cmp w10, w9, uxth +; NOLSE-NEXT: b.ne .LBB1_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB1_3 +; NOLSE-NEXT: b .LBB1_1 +; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvt s1, h0 +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: .LBB1_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvt s2, h0 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fcvt h2, s2 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casalh w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8, uxth +; LSE-NEXT: b.ne .LBB1_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 + ret half %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { 
+; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB2_2 +; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB2_5 +; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB2_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB2_3 +; NOLSE-NEXT: b .LBB2_1 +; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: .LBB2_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov 
w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB2_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 
// 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %res +} + +define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: mov w8, #32767 // =0x7fff +; NOLSE-NEXT: ldr h0, [x0] +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s1, w9 +; NOLSE-NEXT: b .LBB3_2 +; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; NOLSE-NEXT: fmov s0, w11 +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.eq .LBB3_5 +; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: lsl w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fsub s2, s2, s1 +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: ubfx w10, w9, #16, #1 +; NOLSE-NEXT: add w9, w9, w8 +; NOLSE-NEXT: add w9, w10, w9 +; NOLSE-NEXT: lsr w9, w9, #16 +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxrh w11, [x0] +; NOLSE-NEXT: cmp w11, w9, uxth +; NOLSE-NEXT: b.ne .LBB3_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; NOLSE-NEXT: stlxrh wzr, w10, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB3_3 +; NOLSE-NEXT: b .LBB3_1 +; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: mov w8, #32767 // =0x7fff +; LSE-NEXT: ldr h0, [x0] +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov 
s1, w9 +; LSE-NEXT: .LBB3_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fsub s2, s2, s1 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: ubfx w10, w9, #16, #1 +; LSE-NEXT: add w9, w9, w8 +; LSE-NEXT: add w9, w10, w9 +; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: lsr w9, w9, #16 +; LSE-NEXT: mov w11, w10 +; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s0, w11 +; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: b.ne .LBB3_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: b .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; 
SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 + ret bfloat %res +} + +define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr s1, [x0] +; NOLSE-NEXT: b .LBB4_2 +; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; NOLSE-NEXT: fmov s1, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB4_5 +; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: fsub s2, s1, s0 +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB4_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB4_3 +; NOLSE-NEXT: b .LBB4_1 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: fmov s0, s1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: ldr s1, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub s2, s1, s0 +; LSE-NEXT: fmov w8, s1 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s1, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov s0, s1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fsub_f32_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: b .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %res +} + +define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB5_2 +; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 
+; NOLSE-NEXT: b.eq .LBB5_5 +; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: fsub d2, d1, d0 +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB5_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB5_3 +; NOLSE-NEXT: b .LBB5_1 +; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub d2, d1, d0 +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: b .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %res +} + +define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; NOLSE: // %bb.0: +; NOLSE-NEXT: sub sp, sp, #96 +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: mov x19, x0 +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_2 
Depth=1 +; NOLSE-NEXT: stp x12, x13, [sp, #32] +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: ldr q1, [sp, #32] +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: b.eq .LBB6_6 +; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: bl __subtf3 +; NOLSE-NEXT: str q0, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: ldp x9, x8, [sp, #48] +; NOLSE-NEXT: str q0, [sp, #64] +; NOLSE-NEXT: ldp x11, x10, [sp, #64] +; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x19] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x10 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB6_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x9, x8, [x19] +; NOLSE-NEXT: cbnz w14, .LBB6_3 +; NOLSE-NEXT: b .LBB6_1 +; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; LSE: // %bb.0: +; LSE-NEXT: sub sp, sp, #96 +; LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; LSE-NEXT: mov x19, x0 +; LSE-NEXT: str q0, [sp] // 16-byte Folded Spill +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: str q1, [sp, 
#16] // 16-byte Folded Spill +; LSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; LSE-NEXT: bl __subtf3 +; LSE-NEXT: str q0, [sp, #48] +; LSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; LSE-NEXT: ldp x0, x1, [sp, #48] +; LSE-NEXT: str q0, [sp, #64] +; LSE-NEXT: ldp x2, x3, [sp, #64] +; LSE-NEXT: mov x4, x2 +; LSE-NEXT: mov x5, x3 +; LSE-NEXT: caspal x4, x5, x0, x1, [x19] +; LSE-NEXT: stp x4, x5, [sp, #32] +; LSE-NEXT: cmp x5, x3 +; LSE-NEXT: ldr q1, [sp, #32] +; LSE-NEXT: ccmp x4, x2, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: add sp, sp, #96 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: b .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x2, x21 +; SOFTFP-NOLSE-NEXT: mov x3, x19 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: bl __subtf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: mov x9, x1 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, 
[x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w10, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w10, w10, ne +; SOFTFP-NOLSE-NEXT: cbz w10, .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w10, x8, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, fp128 %value seq_cst, align 16 + ret fp128 %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: fcvtl v1.4s, v0.4h +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB7_5 +; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; NOLSE-NEXT: fcvtl v2.4s, v0.4h +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; NOLSE-NEXT: fcvtn v2.4h, v2.4s +; NOLSE-NEXT: fmov w8, s2 +; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB7_1 +; NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB7_3 +; NOLSE-NEXT: b .LBB7_1 +; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: fcvtl v1.4s, v0.4h +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fcvtl v2.4s, v0.4h +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: fsub v2.4s, v2.4s, v1.4s +; LSE-NEXT: fcvtn v2.4h, v2.4s +; LSE-NEXT: fmov w9, s2 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: stp x30, x25, [sp, #-64]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w24 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w25, w0 +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w25 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, 
[x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x half> %value seq_cst, align 4 + ret <2 x half> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; NOLSE: // %bb.0: +; NOLSE-NEXT: movi v1.4s, #1 +; NOLSE-NEXT: movi v2.4s, #127, msl #8 +; NOLSE-NEXT: shll v3.4s, v0.4h, #16 +; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; NOLSE-NEXT: fmov s0, w10 +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.eq .LBB8_5 +; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; NOLSE-NEXT: shll v4.4s, v0.4h, #16 +; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 +; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b +; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s +; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; NOLSE-NEXT: fmov w8, s4 +; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr w10, [x0] +; NOLSE-NEXT: cmp w10, w9 +; NOLSE-NEXT: b.ne .LBB8_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, w8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB8_3 +; NOLSE-NEXT: b .LBB8_1 +; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; 
NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; LSE: // %bb.0: +; LSE-NEXT: movi v1.4s, #1 +; LSE-NEXT: movi v2.4s, #127, msl #8 +; LSE-NEXT: shll v3.4s, v0.4h, #16 +; LSE-NEXT: ldr s0, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: shll v4.4s, v0.4h, #16 +; LSE-NEXT: fmov w8, s0 +; LSE-NEXT: fsub v4.4s, v4.4s, v3.4s +; LSE-NEXT: mov w10, w8 +; LSE-NEXT: ushr v5.4s, v4.4s, #16 +; LSE-NEXT: and v5.16b, v5.16b, v1.16b +; LSE-NEXT: add v4.4s, v5.4s, v4.4s +; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s +; LSE-NEXT: fmov w9, s4 +; LSE-NEXT: casal w10, w9, [x0] +; LSE-NEXT: fmov s0, w10 +; LSE-NEXT: cmp w10, w8 +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w8, w1 +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 
16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x bfloat> %value seq_cst, align 4 + ret <2 x bfloat> %res +} + +define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr d1, [x0] +; NOLSE-NEXT: b .LBB9_2 +; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; NOLSE-NEXT: fmov d1, x10 +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.eq .LBB9_5 +; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; NOLSE-NEXT: fmov x9, d1 +; NOLSE-NEXT: fmov x8, d2 +; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxr x10, [x0] +; NOLSE-NEXT: cmp x10, x9 +; NOLSE-NEXT: b.ne .LBB9_1 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; NOLSE-NEXT: stlxr wzr, x8, [x0] +; NOLSE-NEXT: cbnz wzr, .LBB9_3 +; NOLSE-NEXT: b .LBB9_1 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, d1 +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; LSE: // %bb.0: +; LSE-NEXT: ldr d1, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2s, v1.2s, v0.2s +; LSE-NEXT: fmov x8, d1 +; LSE-NEXT: mov x10, x8 +; LSE-NEXT: fmov x9, d2 +; LSE-NEXT: casal x10, x9, [x0] +; LSE-NEXT: fmov d1, x10 +; LSE-NEXT: cmp x10, x8 +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: fmov d0, d1 +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: +; 
SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w19, w2 +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: b .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w19 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w24, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: bl __subsf3 +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x float> %value seq_cst, align 8 + ret <2 x float> %res +} + +define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { +; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; NOLSE: // %bb.0: +; NOLSE-NEXT: ldr q1, [x0] +; NOLSE-NEXT: b .LBB10_2 +; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; NOLSE-NEXT: fmov d1, x12 +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: ccmp x12, x11, #0, eq +; NOLSE-NEXT: mov v1.d[1], x13 +; NOLSE-NEXT: b.eq .LBB10_6 +; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; NOLSE-NEXT: // =>This Loop Header: Depth=1 +; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; NOLSE-NEXT: mov x9, v1.d[1] +; NOLSE-NEXT: fmov x11, d1 +; NOLSE-NEXT: mov x8, v2.d[1] +; NOLSE-NEXT: fmov x10, d2 +; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 +; NOLSE-NEXT: ldaxp x12, x13, [x0] +; NOLSE-NEXT: cmp x12, x11 +; NOLSE-NEXT: cset w14, ne +; NOLSE-NEXT: cmp x13, x9 +; NOLSE-NEXT: cinc w14, w14, ne +; NOLSE-NEXT: cbz w14, .LBB10_5 +; NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x12, x13, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; NOLSE-NEXT: stlxp w14, x10, x8, [x0] +; NOLSE-NEXT: cbnz w14, .LBB10_3 +; NOLSE-NEXT: b .LBB10_1 +; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: mov v0.16b, v1.16b +; NOLSE-NEXT: ret +; +; LSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; LSE: // %bb.0: +; 
LSE-NEXT: ldr q1, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: fsub v2.2d, v1.2d, v0.2d +; LSE-NEXT: mov x3, v1.d[1] +; LSE-NEXT: fmov x2, d1 +; LSE-NEXT: mov x7, x3 +; LSE-NEXT: mov x5, v2.d[1] +; LSE-NEXT: mov x6, x2 +; LSE-NEXT: fmov x4, d2 +; LSE-NEXT: caspal x6, x7, x4, x5, [x0] +; LSE-NEXT: fmov d1, x6 +; LSE-NEXT: cmp x7, x3 +; LSE-NEXT: ccmp x6, x2, #0, eq +; LSE-NEXT: mov v1.d[1], x7 +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: mov v0.16b, v1.16b +; LSE-NEXT: ret +; +; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: +; SOFTFP-NOLSE: // %bb.0: +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x20, x0 +; SOFTFP-NOLSE-NEXT: mov x19, x3 +; SOFTFP-NOLSE-NEXT: ldp x0, x1, [x0] +; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov x21, x2 +; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: b .LBB10_2 +; SOFTFP-NOLSE-NEXT: .LBB10_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: ccmp x0, x23, #0, eq +; SOFTFP-NOLSE-NEXT: b.eq .LBB10_6 +; SOFTFP-NOLSE-NEXT: .LBB10_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 +; SOFTFP-NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; SOFTFP-NOLSE-NEXT: mov x22, x1 +; SOFTFP-NOLSE-NEXT: mov x23, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x1 +; SOFTFP-NOLSE-NEXT: mov x1, x19 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x24, x0 +; SOFTFP-NOLSE-NEXT: mov x0, x23 +; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: bl __subdf3 +; SOFTFP-NOLSE-NEXT: mov x8, x0 +; SOFTFP-NOLSE-NEXT: .LBB10_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 +; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: 
Depth=2 +; SOFTFP-NOLSE-NEXT: ldaxp x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cmp x0, x23 +; SOFTFP-NOLSE-NEXT: cset w9, ne +; SOFTFP-NOLSE-NEXT: cmp x1, x22 +; SOFTFP-NOLSE-NEXT: cinc w9, w9, ne +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB10_5 +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x0, x1, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_5: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 +; SOFTFP-NOLSE-NEXT: stlxp w9, x8, x24, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB10_3 +; SOFTFP-NOLSE-NEXT: b .LBB10_1 +; SOFTFP-NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ret + %res = atomicrmw fsub ptr %ptr, <2 x double> %value seq_cst, align 16 + ret <2 x double> %res +} + +attributes #0 = { nounwind }