
Commit 413a66f

[LV, VP]VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL. (#76172)
This patch introduces generating VP intrinsics in the Loop Vectorizer.

Currently the Loop Vectorizer supports vector predication in a very limited capacity, via tail-folding and masked load/store/gather/scatter intrinsics. However, this does not let architectures with active vector length predication support take advantage of their capabilities, and architectures with general masked predication support can only apply predication to memory operations. By giving the Loop Vectorizer a way to generate Vector Predication intrinsics, which (will) provide a target-independent way to model predicated vector instructions, these architectures can make better use of their predication capabilities.

Our first approach (implemented in this patch) builds on top of the existing tail-folding mechanism in the LV (it just adds a new tail-folding mode using EVL), but instead of generating masked intrinsics for memory operations it generates VP intrinsics for load/store instructions. The patch adds a new VPlanTransforms transformation to replace the wide header predicate compare with EVL, and updates codegen for loads/stores to use VP load/store with EVL.

The other important part of this approach is how the Explicit Vector Length is computed. (VP intrinsics define this vector length parameter as the Explicit Vector Length (EVL).) We use the experimental intrinsic get_vector_length, which can be lowered to architecture-specific instruction(s) to compute EVL; a new recipe is also added to emit the instructions for computing EVL. Using VPlan in this way will eventually help build and compare VPlans corresponding to different strategies and alternatives.

Differential Revision: https://reviews.llvm.org/D99750
1 parent bffc0b6 commit 413a66f

27 files changed: +2124 −69 lines
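To make the EVL computation described above concrete, here is a minimal sketch (not the commit's actual recipe codegen; the helper name emitEVL and its surrounding setup are assumptions) of emitting the experimental get_vector_length intrinsic through IRBuilder:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Hypothetical helper, for illustration only: compute EVL for the number of
// remaining iterations AVL, letting the target clamp it to at most VF lanes.
// declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt,
//                                                      i32 immarg %vf,
//                                                      i1 immarg %scalable)
static Value *emitEVL(IRBuilderBase &Builder, Value *AVL, ElementCount VF) {
  return Builder.CreateIntrinsic(
      Intrinsic::experimental_get_vector_length, {AVL->getType()},
      {AVL, Builder.getInt32(VF.getKnownMinValue()),
       Builder.getInt1(VF.isScalable())},
      /*FMFSource=*/nullptr, "evl");
}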

llvm/include/llvm/Analysis/TargetTransformInfo.h
Lines changed: 4 additions & 1 deletion

@@ -190,7 +190,10 @@ enum class TailFoldingStyle {
   /// Use predicate to control both data and control flow, but modify
   /// the trip count so that a runtime overflow check can be avoided
   /// and such that the scalar epilogue loop can always be removed.
-  DataAndControlFlowWithoutRuntimeCheck
+  DataAndControlFlowWithoutRuntimeCheck,
+  /// Use predicated EVL instructions for tail-folding.
+  /// Indicates that VP intrinsics should be used.
+  DataWithEVL,
 };
 
 struct TailFoldingInfo {

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Lines changed: 4 additions & 0 deletions

@@ -245,6 +245,10 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
   return TTI::TCC_Free;
 }
 
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+  return ST->hasVInstructions();
+}
+
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Lines changed: 16 additions & 0 deletions

@@ -78,6 +78,22 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                             const APInt &Imm, Type *Ty,
                             TTI::TargetCostKind CostKind);
 
+  /// \name EVL Support for predicated vectorization.
+  /// Whether the target supports the %evl parameter of VP intrinsic efficiently
+  /// in hardware, for the given opcode and type/alignment. (see LLVM Language
+  /// Reference - "Vector Predication Intrinsics",
+  /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and
+  /// "IR-level VP intrinsics",
+  /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics).
+  /// \param Opcode the opcode of the instruction checked for predicated version
+  /// support.
+  /// \param DataType the type of the instruction with the \p Opcode checked for
+  /// prediction support.
+  /// \param Alignment the alignment for memory access operation checked for
+  /// predicated version support.
+  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+                             Align Alignment) const;
+
   TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
 
   bool shouldExpandReduction(const IntrinsicInst *II) const;
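A minimal usage sketch of this hook, mirroring the conservative query the cost model makes in setTailFoldingStyles below (TTI is an assumed TargetTransformInfo instance; the zero opcode and null type follow the FIXME noted there):

// Ask whether the target supports the %evl parameter at all, without
// committing to a specific opcode, data type, or alignment yet.
bool PrefersEVL =
    TTI.hasActiveVectorLength(/*Opcode=*/0, /*DataType=*/nullptr, Align());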

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 180 additions & 25 deletions

@@ -124,6 +124,7 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
     clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                "Create lane mask using active.lane.mask intrinsic, and use "
                "it for both data and control flow"),
-    clEnumValN(
-        TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
-        "data-and-control-without-rt-check",
-        "Similar to data-and-control, but remove the runtime check")));
+    clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+               "data-and-control-without-rt-check",
+               "Similar to data-and-control, but remove the runtime check"),
+    clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+               "Use predicated EVL instructions for tail folding. If EVL "
+               "is unsupported, fallback to data-without-lane-mask.")));
 
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1505,29 +1508,62 @@ class LoopVectorizationCostModel {
 
   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
-    return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
-                               : ChosenTailFoldingStyle.second;
+    if (!ChosenTailFoldingStyle)
+      return TailFoldingStyle::None;
+    return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+                               : ChosenTailFoldingStyle->second;
   }
 
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
-  void setTailFoldingStyles() {
-    assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
-           ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
-           "Tail folding must not be selected yet.");
-    if (!Legal->prepareToFoldTailByMasking())
+  /// \param IsScalableVF true if scalable vector factors enabled.
+  /// \param UserIC User specific interleave count.
+  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+    if (!Legal->prepareToFoldTailByMasking()) {
+      ChosenTailFoldingStyle =
+          std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
       return;
+    }
 
-    if (ForceTailFoldingStyle.getNumOccurrences()) {
-      ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
-          ForceTailFoldingStyle;
+    if (!ForceTailFoldingStyle.getNumOccurrences()) {
+      ChosenTailFoldingStyle = std::make_pair(
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
       return;
     }
 
-    ChosenTailFoldingStyle.first =
-        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
-    ChosenTailFoldingStyle.second =
-        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
+    // Set styles when forced.
+    ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+                                            ForceTailFoldingStyle.getValue());
+    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+      return;
+    // Override forced styles if needed.
+    // FIXME: use actual opcode/data type for analysis here.
+    // FIXME: Investigate opportunity for fixed vector factor.
+    bool EVLIsLegal =
+        IsScalableVF && UserIC <= 1 &&
+        TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+        !EnableVPlanNativePath &&
+        // FIXME: implement support for max safe dependency distance.
+        Legal->isSafeForAnyVectorWidth() &&
+        // FIXME: remove this once reductions are supported.
+        Legal->getReductionVars().empty();
+    if (!EVLIsLegal) {
+      // If for some reason EVL mode is unsupported, fallback to
+      // DataWithoutLaneMask to try to vectorize the loop with folded tail
+      // in a generic way.
+      ChosenTailFoldingStyle =
+          std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+                         TailFoldingStyle::DataWithoutLaneMask);
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Preference for VP intrinsics indicated. Will "
+             "not try to generate VP Intrinsics "
+          << (UserIC > 1
+                  ? "since interleave count specified is greater than 1.\n"
+                  : "due to non-interleaving reasons.\n"));
+    }
   }
 
   /// Returns true if all loop blocks should be masked to fold tail loop.
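For orientation, a small sketch of how the saved pair is consumed afterwards (CM is an assumed LoopVectorizationCostModel on which setTailFoldingStyles has already run):

// The first element is used when the IV update may overflow, the second
// when it provably cannot; both are DataWithEVL when EVL mode is legal.
TailFoldingStyle MayOverflow =
    CM.getTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
TailFoldingStyle NoOverflow =
    CM.getTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
bool UsesEVL = NoOverflow == TailFoldingStyle::DataWithEVL;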
@@ -1544,6 +1580,18 @@ class LoopVectorizationCostModel {
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics with explicit vector length support should
+  /// be generated in the tail folded loop.
+  bool foldTailWithEVL() const {
+    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+           // FIXME: remove this once vp_reverse is supported.
+           none_of(
+               WideningDecisions,
+               [](const std::pair<std::pair<Instruction *, ElementCount>,
+                                  std::pair<InstWidening, InstructionCost>>
+                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+  }
+
   /// Returns true if the Phi is part of an inloop reduction.
   bool isInLoopReduction(PHINode *Phi) const {
     return InLoopReductions.contains(Phi);
@@ -1688,8 +1736,8 @@ class LoopVectorizationCostModel {
 
   /// Control finally chosen tail folding style. The first element is used if
   /// the IV update may overflow, the second element - if it does not.
-  std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
-      std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+      ChosenTailFoldingStyle;
 
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  setTailFoldingStyles();
-  if (foldTailByMasking())
+  setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+  if (foldTailByMasking()) {
+    if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+             "try to generate VP Intrinsics with scalable vector "
+             "factors only.\n");
+      // Tail folded loop using VP intrinsics restricts the VF to be scalable
+      // for now.
+      // TODO: extend it for fixed vectors, if required.
+      assert(MaxFactors.ScalableVF.isScalable() &&
+             "Expected scalable vector factor.");
+
+      MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
     return MaxFactors;
+  }
 
   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!isScalarEpilogueAllowed())
     return 1;
 
+  // Do not interleave if EVL is preferred and no User IC is specified.
+  if (foldTailWithEVL()) {
+    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+                         "Unroll factor forced to be 1.\n");
+    return 1;
+  }
+
   // We used the distance for the interleave count.
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VPlanTransforms::truncateToMinimalBitwidths(
         *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
     VPlanTransforms::optimize(*Plan, *PSE.getSE());
+    // TODO: try to put it close to addActiveLaneMask().
+    if (CM.foldTailWithEVL())
+      VPlanTransforms::addExplicitVectorLength(*Plan);
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
 
   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-  Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+  Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
   Value *DerivedIV = emitTransformedIndex(
       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
       Kind, cast_if_present<BinaryOperator>(FPBinOp));
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
+/// Creates either vp_store or vp_scatter intrinsics calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+                                Value *StoredVal, bool IsScatter, Value *Mask,
+                                Value *EVL, const Align &Alignment) {
+  CallInst *Call;
+  if (IsScatter) {
+    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                   Intrinsic::vp_scatter,
+                                   {StoredVal, Addr, Mask, EVL});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVL->getContext()),
+        {StoredVal, Addr}));
+  }
+  Call->addParamAttr(
+      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsics calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+                                                   VectorType *DataTy,
+                                                   Value *Addr, bool IsGather,
+                                                   Value *Mask, Value *EVL,
+                                                   const Align &Alignment) {
+  CallInst *Call;
+  if (IsGather) {
+    Call =
+        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+                                nullptr, "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  Call->addParamAttr(
+      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
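A hypothetical call into the load helper above for a unit-stride predicated load (Builder, Ptr, Mask, and EVL are assumed to be in scope); the trailing comment shows the rough shape of the IR this produces:

auto *VecTy = ScalableVectorType::get(Builder.getInt32Ty(), 4);
Instruction *NewLI = lowerLoadUsingVectorIntrinsics(
    Builder, VecTy, Ptr, /*IsGather=*/false, Mask, EVL, Align(4));
// Roughly: %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//              ptr align 4 %Ptr, <vscale x 4 x i1> %Mask, i32 %EVL)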

@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Instruction *NewSI = nullptr;
     Value *StoredVal = State.get(StoredValue, Part);
-    if (CreateGatherScatter) {
+    // TODO: split this into several classes for better design.
+    if (State.EVL) {
+      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                              "explicit vector length.");
+      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+                 VPInstruction::ExplicitVectorLength &&
+             "EVL must be VPInstruction::ExplicitVectorLength.");
+      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly default value = whole vector register length. EVL
+      // is created only if TTI prefers predicated vectorization, thus if EVL
+      // is not nullptr it also implies preference for predicated
+      // vectorization.
+      // FIXME: Support reverse store after vp_reverse is added.
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      NewSI = lowerStoreUsingVectorIntrinsics(
+          Builder, State.get(getAddr(), Part, !CreateGatherScatter),
+          StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+    } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
       NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
-    if (CreateGatherScatter) {
+    // TODO: split this into several classes for better design.
+    if (State.EVL) {
+      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                              "explicit vector length.");
+      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+                 VPInstruction::ExplicitVectorLength &&
+             "EVL must be VPInstruction::ExplicitVectorLength.");
+      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly default value = whole vector register length. EVL
+      // is created only if TTI prefers predicated vectorization, thus if EVL
+      // is not nullptr it also implies preference for predicated
+      // vectorization.
+      // FIXME: Support reverse loading after vp_reverse is added.
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      NewLI = lowerLoadUsingVectorIntrinsics(
+          Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
+          CreateGatherScatter, MaskPart, EVL, Alignment);
+    } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,

llvm/lib/Transforms/Vectorize/VPlan.cpp
Lines changed: 9 additions & 7 deletions

@@ -871,13 +871,15 @@ void VPlan::execute(VPTransformState *State) {
       // only a single part is generated, which provides the last part from the
      // previous iteration. For non-ordered reductions all UF parts are
      // generated.
-      bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
-                              isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
-                              (isa<VPReductionPHIRecipe>(PhiR) &&
-                               cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
-      bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
-                         (isa<VPReductionPHIRecipe>(PhiR) &&
-                          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+      bool SinglePartNeeded =
+          isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+          isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+          (isa<VPReductionPHIRecipe>(PhiR) &&
+           cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+      bool NeedsScalar =
+          isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+          (isa<VPReductionPHIRecipe>(PhiR) &&
+           cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
       unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
 
       for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
