Skip to content

[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches #119728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions llvm/lib/CodeGen/SelectOptimize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
return true;
}

// If latch has a select group with several elements, it is usually profitable
// to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
// conversion is profitable for innermost loops.
auto *BB = SI.getI()->getParent();
auto *L = LI->getLoopFor(BB);
if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
ASI.Selects.size() >= 3) {
OR << "Converted to branch because select group in the latch block is big.";
EmitAndPrintRemark(ORE, OR);
return true;
}

ORmiss << "Not profitable to convert to branch (base heuristic).";
EmitAndPrintRemark(ORE, ORmiss);
return false;
Expand Down
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/AArch64/selectopt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -875,3 +875,124 @@ if.end:
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; External function with no body in this file; @outer_latch_heuristic calls it
; from its inner loop and tests the sign of the final result in the outer latch.
declare i64 @payload(i64, ptr, ptr, i64)

; Test for a select group located in the latch block of a non-innermost loop.
; The outer loop's latch derives three values from the sign of %call.i.us:
;   %i.next     = %i + 1 when the result is negative, otherwise %i
;   %j.next     = %j + 1 when the result is non-negative, otherwise %j
;   %cond.in.us = &src[j] when the result is non-negative, otherwise &src[i]
; Under the CHECKOO prefix the latch is expected to be split into
; select.true.sink, select.false.sink and select.end, with the three values
; above becoming phis. Under the CHECKII prefix the latch is expected to stay
; as a single block that still contains the select.
; NOTE(review): the RUN lines that define these two prefixes are outside this
; hunk; confirm which target configurations they correspond to.
define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
; CHECKOO-LABEL: @outer_latch_heuristic(
; CHECKOO-NEXT: entry:
; CHECKOO-NEXT: br label [[OUTER_LOOP:%.*]]
; CHECKOO: outer.loop:
; CHECKOO-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECKOO-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
; CHECKOO-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
; CHECKOO-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECKOO-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
; CHECKOO-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
; CHECKOO-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
; CHECKOO-NEXT: br label [[INNER_LOOP:%.*]]
; CHECKOO: inner.loop:
; CHECKOO-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECKOO-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
; CHECKOO-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
; CHECKOO-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
; CHECKOO-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; CHECKOO-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]]
; CHECKOO: latch:
; CHECKOO-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
; CHECKOO-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
; CHECKOO-NEXT: [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]]
; CHECKOO-NEXT: br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]]
; CHECKOO: select.true.sink:
; CHECKOO-NEXT: [[TMP2:%.*]] = add nsw i64 [[J]], 1
; CHECKOO-NEXT: br label [[SELECT_END]]
; CHECKOO: select.false.sink:
; CHECKOO-NEXT: [[TMP3:%.*]] = add nsw i64 1, [[I]]
; CHECKOO-NEXT: br label [[SELECT_END]]
; CHECKOO: select.end:
; CHECKOO-NEXT: [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ]
; CHECKOO-NEXT: [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ]
; CHECKOO-NEXT: [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ]
; CHECKOO-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
; CHECKOO-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
; CHECKOO-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
; CHECKOO-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
; CHECKOO-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
; CHECKOO-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
; CHECKOO-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
; CHECKOO: exit:
; CHECKOO-NEXT: ret void
;
; CHECKII-LABEL: @outer_latch_heuristic(
; CHECKII-NEXT: entry:
; CHECKII-NEXT: br label [[OUTER_LOOP:%.*]]
; CHECKII: outer.loop:
; CHECKII-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECKII-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
; CHECKII-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
; CHECKII-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECKII-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
; CHECKII-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
; CHECKII-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
; CHECKII-NEXT: br label [[INNER_LOOP:%.*]]
; CHECKII: inner.loop:
; CHECKII-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECKII-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
; CHECKII-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
; CHECKII-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
; CHECKII-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; CHECKII-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
; CHECKII: latch:
; CHECKII-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
; CHECKII-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
; CHECKII-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
; CHECKII-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
; CHECKII-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
; CHECKII-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
; CHECKII-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
; CHECKII-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
; CHECKII-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
; CHECKII-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
; CHECKII-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
; CHECKII-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
; CHECKII: exit:
; CHECKII-NEXT: ret void
;
entry:
br label %outer.loop

; Outer loop header: %k.020.us is the output index into %dst; %i and %j are two
; independent cursors into %src. The pointers stored at src[i] and src[j] are
; loaded here and passed to every @payload call of the inner loop.
outer.loop:
%k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
%j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
%4 = load ptr, ptr %arrayidx.us, align 8
%arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
%5 = load ptr, ptr %arrayidx1.us, align 8
br label %inner.loop

; Inner (innermost) loop: counts %lsr.iv down from %dim to 0, feeding the
; previous @payload result (0 on the first iteration) back in as the first
; argument. Having this nested loop is what makes the outer loop non-innermost.
inner.loop:
%lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
%diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
%call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
%lsr.iv.next = add i64 %lsr.iv, -1
%exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
br i1 %exitcond.not.i.us, label %latch, label %inner.loop

; Outer-loop latch holding the select group under test. %cmp2.us is true when
; the final @payload result is non-negative; the lshr by 63 extracts its sign
; bit. Each of the two adds therefore adds 0 or 1, so exactly one of %i and %j
; advances per outer iteration, and the select picks the matching src slot.
; The chosen pointer is stored to dst[k]; the loop exits when %k.020.us == 1000.
latch:
%cmp2.us = icmp sgt i64 %call.i.us, -1
%diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
%i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
%inc4.us = zext i1 %cmp2.us to i64
%j.next = add nsw i64 %j, %inc4.us
%cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
%cond.us = load ptr, ptr %cond.in.us, align 8
%arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
store ptr %cond.us, ptr %arrayidx6.us, align 8
%inc7.us = add i64 %k.020.us, 1
%exitcond23.not = icmp eq i64 %k.020.us, 1000
br i1 %exitcond23.not, label %exit, label %outer.loop

exit:
ret void
}
Loading