[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

igogo-x86 · 2024-12-12T17:36:46Z

Loop latches often have a loop-carried dependency, and if they have several SelectLike instructions in one select group, it is usually profitable to convert it to branches rather than keep selects.

llvmbot · 2024-12-12T17:37:23Z

@llvm/pr-subscribers-backend-aarch64

Author: Igor Kirillov (igogo-x86)

Changes

Loop latches often have a loop-carried dependency, and if they have several SelectLike instructions in one select group, it is usually profitable to convert it to branches rather than keep selects.

Full diff: https://github.com/llvm/llvm-project/pull/119728.diff

2 Files Affected:

(modified) llvm/lib/CodeGen/SelectOptimize.cpp (+12)
(modified) llvm/test/CodeGen/AArch64/selectopt.ll (+121)

diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 98321a395a506a..06201b00f93461 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -1041,6 +1041,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
     return true;
   }
 
+  // If latch has a select group with several elements, it is usually profitable
+  // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
+  // conversion is profitable for innermost loops.
+  auto *BB = SI.getI()->getParent();
+  auto *L = LI->getLoopFor(BB);
+  if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
+      ASI.Selects.size() >= 3) {
+    OR << "Converted to branch because select group in the latch block is big.";
+    EmitAndPrintRemark(ORE, OR);
+    return true;
+  }
+
   ORmiss << "Not profitable to convert to branch (base heuristic).";
   EmitAndPrintRemark(ORE, ORmiss);
   return false;
diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll
index 54309dca3b8345..d72a956e08d0c1 100644
--- a/llvm/test/CodeGen/AArch64/selectopt.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt.ll
@@ -875,3 +875,124 @@ if.end:
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+declare i64 @payload(i64, ptr, ptr, i64)
+
+define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
+; CHECKOO-LABEL: @outer_latch_heuristic(
+; CHECKOO-NEXT:  entry:
+; CHECKOO-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKOO:       outer.loop:
+; CHECKOO-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKOO-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKOO-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKOO-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKOO-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKOO:       inner.loop:
+; CHECKOO-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKOO-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKOO-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKOO-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKOO-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKOO-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]]
+; CHECKOO:       latch:
+; CHECKOO-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKOO-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKOO-NEXT:    [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]]
+; CHECKOO-NEXT:    br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]]
+; CHECKOO:       select.true.sink:
+; CHECKOO-NEXT:    [[TMP2:%.*]] = add nsw i64 [[J]], 1
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.false.sink:
+; CHECKOO-NEXT:    [[TMP3:%.*]] = add nsw i64 1, [[I]]
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.end:
+; CHECKOO-NEXT:    [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKOO-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKOO-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKOO-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKOO-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKOO-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKOO:       exit:
+; CHECKOO-NEXT:    ret void
+;
+; CHECKII-LABEL: @outer_latch_heuristic(
+; CHECKII-NEXT:  entry:
+; CHECKII-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKII:       outer.loop:
+; CHECKII-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKII-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKII-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKII-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKII-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKII:       inner.loop:
+; CHECKII-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKII-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKII-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKII-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKII-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKII-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
+; CHECKII:       latch:
+; CHECKII-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKII-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKII-NEXT:    [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
+; CHECKII-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKII-NEXT:    [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
+; CHECKII-NEXT:    [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
+; CHECKII-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKII-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKII-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKII-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKII-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKII:       exit:
+; CHECKII-NEXT:    ret void
+;
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
+  %4 = load ptr, ptr %arrayidx.us, align 8
+  %arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
+  %5 = load ptr, ptr %arrayidx1.us, align 8
+  br label %inner.loop
+
+inner.loop:
+  %lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
+  %diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
+  %call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
+  %lsr.iv.next = add i64 %lsr.iv, -1
+  %exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
+  br i1 %exitcond.not.i.us, label %latch, label %inner.loop
+
+latch:
+  %cmp2.us = icmp sgt i64 %call.i.us, -1
+  %diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
+  %i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
+  %inc4.us = zext i1 %cmp2.us to i64
+  %j.next = add nsw i64 %j, %inc4.us
+  %cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
+  %cond.us = load ptr, ptr %cond.in.us, align 8
+  %arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
+  store ptr %cond.us, ptr %arrayidx6.us, align 8
+  %inc7.us = add i64 %k.020.us, 1
+  %exitcond23.not = icmp eq i64 %k.020.us, 1000
+  br i1 %exitcond23.not, label %exit, label %outer.loop
+
+exit:
+  ret void
+}

igogo-x86 · 2024-12-16T15:06:15Z

The test shows an example of code that gets massive performance improvements on real-life benchmarks, and the heuristic is just as dumb as possible to achieve that effect.

Alternatively, we could check that a group has select-like instructions and convert it in any outer block, not only latch. I guess we better let the model decide what to do for inner loops.
A more complicated heuristic would be to convert if select-like instructions in a big group have loop-carried dependencies:

for () {
  for () {
     ... a[i]...
  }
  if (cond)
    ++i;
}

davemgreen · 2024-12-17T10:48:27Z

This sounds like a bit like tail-duplication if the latch has a select that is profitable to convert. This probably doesn't apply to larger loop latches though, is it possible to represent it in terms of tail duplication and limit it to relatively small latches where the select is dominating?

igogo-x86 · 2024-12-17T13:05:37Z

You can look at the test. SelectGroup consists of 6 instructions at the beginning of the latch, and the latch has 12 instructions in total. It doesn't look like a small latch to me.

davemgreen

You can look at the test. SelectGroup consists of 6 instructions at the beginning of the latch, and the latch has 12 instructions in total. It doesn't look like a small latch to me.

1/2 of the instructions sounds like quite a high proportion. I was thinking about larger latches (hundreds of instructions) that are essentially just like any other blocks and not particularly special for being latches.

The tail duplication I was thinking of isn't triggering in the test file though, if that is representative https://godbolt.org/z/jT68zsevE (even with enough options to prevent the ifcvting). It might be nice if it did. I guess that there are paths through the program that can lead to csel stalls where they are avoided with branches, and this might be one of them even though it is not part of an inner loop. In that way latches are special and we have certainly seen similar issues in other (inner) loops before. Trying to calculate all the possible paths sounds too expensive.

In practice I think this sounds OK to me (even if like you say it is a bit narrow) - in that it will likely not do much harm anywhere and should lead to perf improvements in the cases you have seen, and I like the limit of at least 3 instructions in the select group. Big groups will be generally more profitable to convert.

I think this sounds OK to me.

…op to branches Loop latches often have a loop-carried dependency, and if they have several SelectLike instructions in one select group, it is usually profitable to convert it to branches rather than keep selects.

llvm-ci · 2024-12-25T14:04:46Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve-vls-2stage running on linaro-g3-02 while building llvm at step 6 "build stage 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/4387

Here is the relevant piece of the build log for the reference

Step 6 (build stage 1) failure: 'ninja' (failure)
...
[7740/8796] Linking CXX static library lib/libMLIRFuncToSPIRV.a
[7741/8796] Linking CXX static library lib/libMLIRSCFTransformOps.a
[7742/8796] Linking CXX static library lib/libMLIRIndexToSPIRV.a
[7743/8796] Linking CXX static library lib/libMLIRUBToSPIRV.a
[7744/8796] Linking CXX static library lib/libMLIRMemRefToSPIRV.a
[7745/8796] Linking CXX static library lib/libMLIRMathToSPIRV.a
[7746/8796] Linking CXX static library lib/libMLIRArithToSPIRV.a
[7747/8796] Building CXX object tools/flang/lib/Semantics/CMakeFiles/FortranSemantics.dir/canonicalize-do.cpp.o
[7748/8796] Linking CXX static library lib/libMLIRSCFToSPIRV.a
[7749/8796] Linking CXX static library lib/libMLIRVectorToSPIRV.a
command timed out: 1200 seconds without output running [b'ninja'], attempting to kill
process killed by signal 9
program finished with exit code -1
elapsedTime=3616.097244

…op to branches (llvm#119728) Loop latches often have a loop-carried dependency, and if they have several SelectLike instructions in one select group, it is usually profitable to convert it to branches rather than keep selects. (cherry picked from commit 3469996)

igogo-x86 requested review from davemgreen and sapostolakis December 12, 2024 17:36

llvmbot added the backend:AArch64 label Dec 12, 2024

davemgreen approved these changes Dec 20, 2024

View reviewed changes

igogo-x86 force-pushed the selectopt-selectgroup-in-latch-heuristic branch from 558eb16 to 2176243 Compare December 25, 2024 11:33

igogo-x86 merged commit 3469996 into llvm:main Dec 25, 2024
8 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

Uh oh!

igogo-x86 commented Dec 12, 2024

Uh oh!

llvmbot commented Dec 12, 2024

Uh oh!

igogo-x86 commented Dec 16, 2024

Uh oh!

davemgreen commented Dec 17, 2024

Uh oh!

igogo-x86 commented Dec 17, 2024

Uh oh!

davemgreen left a comment

Uh oh!

Uh oh!

llvm-ci commented Dec 25, 2024

Uh oh!

Uh oh!

[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

Uh oh!

Conversation

igogo-x86 commented Dec 12, 2024

Uh oh!

llvmbot commented Dec 12, 2024

Uh oh!

igogo-x86 commented Dec 16, 2024

Uh oh!

davemgreen commented Dec 17, 2024

Uh oh!

igogo-x86 commented Dec 17, 2024

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Dec 25, 2024

Uh oh!

Uh oh!