diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d81fe54fe844c..287bd00aeba82 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,28 +426,55 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
-  BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
+  // FIXME: This intentionally works in reverse due to some issues with the
+  // Register Units infrastructure.
+  //
+  // This is used to apply callee-saved-register masks to the clobbered regunits
+  // mask.
+  //
+  // The right way to approach this is to start with a BitVector full of ones,
+  // then reset all the bits of the regunits of each register that is set in the
+  // mask (registers preserved), then OR the resulting bits with the Clobbers
+  // mask. This correctly prioritizes the saved registers, so if a RU is shared
+  // between a register that is preserved and one that is NOT preserved, that
+  // RU will not be set in the output vector (the clobbers).
+  //
+  // What we have to do for now is the opposite: we have to assume that the
+  // regunits of all registers that are NOT preserved are clobbered, even if
+  // those regunits are preserved by another register. So if a RU is shared
+  // as described previously, that RU will be set.
+  //
+  // This is to work around an issue which appears on AArch64, but isn't
+  // exclusive to that target: AArch64's Qn registers (128 bits) have a Dn
+  // subregister (the lower 64 bits). A few Dn registers are preserved by some
+  // calling conventions, but Qn and Dn share exactly the same reg units.
+  //
+  // If we do this the right way, Qn will be marked as NOT clobbered even though
+  // its upper 64 bits are NOT preserved. The conservative approach handles this
+  // correctly at the cost of some missed optimizations on other targets.
+  //
+  // This is caused by how RegUnits are handled within TableGen. Ideally, Qn
+  // should have an extra RegUnit to model the "unknown" bits not covered by the
+  // subregs.
+  BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
   const unsigned NumRegs = TRI.getNumRegs();
   const unsigned MaskWords = (NumRegs + 31) / 32;
   for (unsigned K = 0; K < MaskWords; ++K) {
     const uint32_t Word = Mask[K];
-    if (!Word)
-      continue;
-
     for (unsigned Bit = 0; Bit < 32; ++Bit) {
       const unsigned PhysReg = (K * 32) + Bit;
       if (PhysReg == NumRegs)
         break;
 
-      // Check if we have a valid PhysReg that is set in the mask.
-      if ((Word >> Bit) & 1) {
+      // A cleared mask bit means PhysReg is NOT preserved; clobber its RUs.
+      if (PhysReg && !((Word >> Bit) & 1)) {
         for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
-          ClobberedRUs.reset(*RUI);
+          RUsFromRegsNotInMask.set(*RUI);
       }
     }
   }
 
-  RUs |= ClobberedRUs;
+  RUs |= RUsFromRegsNotInMask;
 }
 
 /// Examine the instruction for potential LICM candidate. Also
diff --git a/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
new file mode 100644
index 0000000000000..f6a0abfdc410b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
+
+# FIXME: Running RA is needed, otherwise it runs pre-RA LICM.
+---
+name: test
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $x2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $q11 = MOVIv4i32 2, 8
+  ; CHECK-NEXT:   BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+  ; CHECK-NEXT:   renamable $q10 = MVNIv4i32 4, 0
+  ; CHECK-NEXT:   $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 11, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $q10, $q11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $q0 = COPY $q10
+  ; CHECK-NEXT:   $q1 = COPY $q11
+  bb.0:
+    liveins: $x0, $w1, $x2
+    B %bb.1
+
+  bb.1:
+    liveins: $x0, $w1, $x2
+    renamable $q11 = MOVIv4i32 2, 8
+    BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+    renamable $q10 = MVNIv4i32 4, 0
+    $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    liveins: $q10, $q11
+    $q0 = COPY $q10
+    $q1 = COPY $q11
+...
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..da8aa54469835 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
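
Reviewer note, not part of the patch: the "right way" described in the FIXME is essentially the code this change removes. Start from an all-ones clobber mask, clear the regunits of every register the regmask preserves, then OR the result into RUs. A minimal sketch, reusing the TRI/RUs/Mask names from applyBitsNotInRegMaskToRegUnitsMask above, with a compact per-register loop in place of the word-wise one:

  // Sketch only (see the FIXME): preferred formulation once regunits can
  // model the bits of Qn not covered by Dn.
  BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
  for (unsigned PhysReg = 1; PhysReg < TRI.getNumRegs(); ++PhysReg)
    if ((Mask[PhysReg / 32] >> (PhysReg % 32)) & 1) // bit set: preserved
      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
        ClobberedRUs.reset(*RUI); // this regunit survives the call
  RUs |= ClobberedRUs;

As the FIXME explains, this only becomes safe once TableGen gives Qn an extra regunit for the bits Dn does not cover; until then it would wrongly treat all of Qn as preserved whenever Dn is.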