diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d81fe54fe844c..287bd00aeba82 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,28 +426,55 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
-  BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
+  // FIXME: This intentionally works in reverse due to some issues with the
+  // Register Units infrastructure.
+  //
+  // This is used to apply callee-saved-register masks to the clobbered regunits
+  // mask.
+  //
+  // The right way to approach this is to start with a BitVector full of ones,
+  // then reset all the bits of the regunits of each register that is set in the
+  // mask (registers preserved), then OR the resulting bits with the Clobbers
+  // mask. This correctly prioritizes the saved registers, so if a RU is shared
+  // between a register that is preserved and one that is NOT preserved, that
+  // RU will not be set in the output vector (the clobbers).
+  //
+  // What we have to do for now is the opposite: we have to assume that the
+  // regunits of all registers that are NOT preserved are clobbered, even if
+  // those regunits are preserved by another register. So if a RU is shared
+  // as described previously, that RU will be set.
+  //
+  // This is to work around an issue which appears on AArch64, but isn't
+  // exclusive to that target: AArch64's Qn registers (128 bits) have a Dn
+  // subregister (the lower 64 bits). A few Dn registers are preserved by some
+  // calling conventions, but Qn and Dn share exactly the same reg units.
+  //
+  // If we do this the right way, Qn will be marked as NOT clobbered even though
+  // its upper 64 bits are NOT preserved. The conservative approach handles this
+  // correctly at the cost of some missed optimizations on other targets.
+  //
+  // This is caused by how RegUnits are handled within TableGen. Ideally, Qn
+  // should have an extra RegUnit to model the "unknown" bits not covered by the
+  // subregs.
+  BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
   const unsigned NumRegs = TRI.getNumRegs();
   const unsigned MaskWords = (NumRegs + 31) / 32;
   for (unsigned K = 0; K < MaskWords; ++K) {
     const uint32_t Word = Mask[K];
-    if (!Word)
-      continue;
-
     for (unsigned Bit = 0; Bit < 32; ++Bit) {
       const unsigned PhysReg = (K * 32) + Bit;
       if (PhysReg == NumRegs)
         break;
 
-      // Check if we have a valid PhysReg that is set in the mask.
-      if ((Word >> Bit) & 1) {
+      // A cleared mask bit means PhysReg is NOT preserved; clobber its RUs.
+      if (PhysReg && !((Word >> Bit) & 1)) {
         for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
-          ClobberedRUs.reset(*RUI);
+          RUsFromRegsNotInMask.set(*RUI);
       }
     }
   }
 
-  RUs |= ClobberedRUs;
+  RUs |= RUsFromRegsNotInMask;
 }
 
 /// Examine the instruction for potential LICM candidate. Also
diff --git a/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
new file mode 100644
index 0000000000000..f6a0abfdc410b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
+
+# FIXME: Running RA is needed, otherwise it runs pre-RA LICM.
+---
+name: test
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $q11 = MOVIv4i32 2, 8
+  ; CHECK-NEXT:   BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+  ; CHECK-NEXT:   renamable $q10 = MVNIv4i32 4, 0
+  ; CHECK-NEXT:   $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $q10, $q11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $q0 = COPY $q10
+  ; CHECK-NEXT:   $q1 = COPY $q11
+  bb.0:
+    liveins: $x0, $w1, $x2
+    B %bb.1
+
+  bb.1:
+    liveins: $x0, $w1, $x2
+    renamable $q11 = MOVIv4i32 2, 8
+    BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+    renamable $q10 = MVNIv4i32 4, 0
+    $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    liveins: $q10, $q11
+    $q0 = COPY $q10
+    $q1 = COPY $q11
+...
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..da8aa54469835 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
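
Reviewer note, not part of the patch: the "right way" described in the FIXME is essentially the code this change removes. Start from an all-ones clobber mask, clear the regunits of every register the regmask preserves, then OR the result into RUs. A minimal sketch, reusing the TRI/RUs/Mask names from applyBitsNotInRegMaskToRegUnitsMask above, with a compact per-register loop in place of the word-wise one:

  // Sketch only (see the FIXME): preferred formulation once regunits can
  // model the bits of Qn not covered by Dn.
  BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
  for (unsigned PhysReg = 1; PhysReg < TRI.getNumRegs(); ++PhysReg)
    if ((Mask[PhysReg / 32] >> (PhysReg % 32)) & 1) // bit set: preserved
      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
        ClobberedRUs.reset(*RUI); // this regunit survives the call
  RUs |= ClobberedRUs;

As the FIXME explains, this only becomes safe once TableGen gives Qn an extra regunit for the bits Dn does not cover; until then it would wrongly treat all of Qn as preserved whenever Dn is.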