Skip to content

Commit 3469996

Browse files
authored
[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches (llvm#119728)
Loop latches often have a loop-carried dependency. If the latch of a non-innermost loop contains a select group with several (three or more) SelectLike instructions, it is usually profitable to convert that group to branches rather than keep the selects.
1 parent 4884b1b commit 3469996

File tree

2 files changed

+133
-0
lines changed

2 files changed

+133
-0
lines changed

llvm/lib/CodeGen/SelectOptimize.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,6 +1044,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
10441044
return true;
10451045
}
10461046

1047+
// If latch has a select group with several elements, it is usually profitable
1048+
// to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
1049+
// conversion is profitable for innermost loops.
1050+
auto *BB = SI.getI()->getParent();
1051+
auto *L = LI->getLoopFor(BB);
1052+
if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
1053+
ASI.Selects.size() >= 3) {
1054+
OR << "Converted to branch because select group in the latch block is big.";
1055+
EmitAndPrintRemark(ORE, OR);
1056+
return true;
1057+
}
1058+
10471059
ORmiss << "Not profitable to convert to branch (base heuristic).";
10481060
EmitAndPrintRemark(ORE, ORmiss);
10491061
return false;

llvm/test/CodeGen/AArch64/selectopt.ll

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,3 +875,124 @@ if.end:
875875
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
876876
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
877877
}
878+
879+
declare i64 @payload(i64, ptr, ptr, i64)
880+
881+
define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
882+
; CHECKOO-LABEL: @outer_latch_heuristic(
883+
; CHECKOO-NEXT: entry:
884+
; CHECKOO-NEXT: br label [[OUTER_LOOP:%.*]]
885+
; CHECKOO: outer.loop:
886+
; CHECKOO-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
887+
; CHECKOO-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
888+
; CHECKOO-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
889+
; CHECKOO-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
890+
; CHECKOO-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
891+
; CHECKOO-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
892+
; CHECKOO-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
893+
; CHECKOO-NEXT: br label [[INNER_LOOP:%.*]]
894+
; CHECKOO: inner.loop:
895+
; CHECKOO-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
896+
; CHECKOO-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
897+
; CHECKOO-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
898+
; CHECKOO-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
899+
; CHECKOO-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
900+
; CHECKOO-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]]
901+
; CHECKOO: latch:
902+
; CHECKOO-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
903+
; CHECKOO-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
904+
; CHECKOO-NEXT: [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]]
905+
; CHECKOO-NEXT: br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]]
906+
; CHECKOO: select.true.sink:
907+
; CHECKOO-NEXT: [[TMP2:%.*]] = add nsw i64 [[J]], 1
908+
; CHECKOO-NEXT: br label [[SELECT_END]]
909+
; CHECKOO: select.false.sink:
910+
; CHECKOO-NEXT: [[TMP3:%.*]] = add nsw i64 1, [[I]]
911+
; CHECKOO-NEXT: br label [[SELECT_END]]
912+
; CHECKOO: select.end:
913+
; CHECKOO-NEXT: [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ]
914+
; CHECKOO-NEXT: [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ]
915+
; CHECKOO-NEXT: [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ]
916+
; CHECKOO-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
917+
; CHECKOO-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
918+
; CHECKOO-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
919+
; CHECKOO-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
920+
; CHECKOO-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
921+
; CHECKOO-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
922+
; CHECKOO-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
923+
; CHECKOO: exit:
924+
; CHECKOO-NEXT: ret void
925+
;
926+
; CHECKII-LABEL: @outer_latch_heuristic(
927+
; CHECKII-NEXT: entry:
928+
; CHECKII-NEXT: br label [[OUTER_LOOP:%.*]]
929+
; CHECKII: outer.loop:
930+
; CHECKII-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
931+
; CHECKII-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
932+
; CHECKII-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
933+
; CHECKII-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
934+
; CHECKII-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
935+
; CHECKII-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
936+
; CHECKII-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
937+
; CHECKII-NEXT: br label [[INNER_LOOP:%.*]]
938+
; CHECKII: inner.loop:
939+
; CHECKII-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
940+
; CHECKII-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
941+
; CHECKII-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
942+
; CHECKII-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
943+
; CHECKII-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
944+
; CHECKII-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
945+
; CHECKII: latch:
946+
; CHECKII-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
947+
; CHECKII-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
948+
; CHECKII-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
949+
; CHECKII-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
950+
; CHECKII-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
951+
; CHECKII-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
952+
; CHECKII-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
953+
; CHECKII-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
954+
; CHECKII-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
955+
; CHECKII-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
956+
; CHECKII-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
957+
; CHECKII-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
958+
; CHECKII: exit:
959+
; CHECKII-NEXT: ret void
960+
;
961+
entry:
962+
br label %outer.loop
963+
964+
outer.loop:
965+
%k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
966+
%j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
967+
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
968+
%arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
969+
%4 = load ptr, ptr %arrayidx.us, align 8
970+
%arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
971+
%5 = load ptr, ptr %arrayidx1.us, align 8
972+
br label %inner.loop
973+
974+
inner.loop:
975+
%lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
976+
%diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
977+
%call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
978+
%lsr.iv.next = add i64 %lsr.iv, -1
979+
%exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
980+
br i1 %exitcond.not.i.us, label %latch, label %inner.loop
981+
982+
latch:
983+
%cmp2.us = icmp sgt i64 %call.i.us, -1
984+
%diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
985+
%i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
986+
%inc4.us = zext i1 %cmp2.us to i64
987+
%j.next = add nsw i64 %j, %inc4.us
988+
%cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
989+
%cond.us = load ptr, ptr %cond.in.us, align 8
990+
%arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
991+
store ptr %cond.us, ptr %arrayidx6.us, align 8
992+
%inc7.us = add i64 %k.020.us, 1
993+
%exitcond23.not = icmp eq i64 %k.020.us, 1000
994+
br i1 %exitcond23.not, label %exit, label %outer.loop
995+
996+
exit:
997+
ret void
998+
}

0 commit comments

Comments
 (0)