 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
       clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                  "Create lane mask using active.lane.mask intrinsic, and use "
                  "it for both data and control flow"),
-      clEnumValN(
-          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
-          "data-and-control-without-rt-check",
-          "Similar to data-and-control, but remove the runtime check")));
+      clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+                 "data-and-control-without-rt-check",
+                 "Similar to data-and-control, but remove the runtime check"),
+      clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+                 "Use predicated EVL instructions for tail folding. If EVL "
+                 "is unsupported, fallback to data-without-lane-mask.")));

 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1505,29 +1508,62 @@ class LoopVectorizationCostModel {

   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
-    return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
-                               : ChosenTailFoldingStyle.second;
+    if (!ChosenTailFoldingStyle)
+      return TailFoldingStyle::None;
+    return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+                               : ChosenTailFoldingStyle->second;
   }

   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
-  void setTailFoldingStyles() {
-    assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
-           ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
-           "Tail folding must not be selected yet.");
-    if (!Legal->prepareToFoldTailByMasking())
+  /// \param IsScalableVF true if scalable vector factors enabled.
+  /// \param UserIC User specific interleave count.
+  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+    if (!Legal->prepareToFoldTailByMasking()) {
+      ChosenTailFoldingStyle =
+          std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
       return;
+    }

-    if (ForceTailFoldingStyle.getNumOccurrences()) {
-      ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
-          ForceTailFoldingStyle;
+    if (!ForceTailFoldingStyle.getNumOccurrences()) {
+      ChosenTailFoldingStyle = std::make_pair(
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
       return;
     }

-    ChosenTailFoldingStyle.first =
-        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
-    ChosenTailFoldingStyle.second =
-        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
+    // Set styles when forced.
+    ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+                                            ForceTailFoldingStyle.getValue());
+    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+      return;
+    // Override forced styles if needed.
+    // FIXME: use actual opcode/data type for analysis here.
+    // FIXME: Investigate opportunity for fixed vector factor.
+    bool EVLIsLegal =
+        IsScalableVF && UserIC <= 1 &&
+        TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+        !EnableVPlanNativePath &&
+        // FIXME: implement support for max safe dependency distance.
+        Legal->isSafeForAnyVectorWidth() &&
+        // FIXME: remove this once reductions are supported.
+        Legal->getReductionVars().empty();
+    if (!EVLIsLegal) {
+      // If for some reason EVL mode is unsupported, fallback to
+      // DataWithoutLaneMask to try to vectorize the loop with folded tail
+      // in a generic way.
+      ChosenTailFoldingStyle =
+          std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+                         TailFoldingStyle::DataWithoutLaneMask);
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Preference for VP intrinsics indicated. Will "
+             "not try to generate VP Intrinsics "
+          << (UserIC > 1
+                  ? "since interleave count specified is greater than 1.\n"
+                  : "due to non-interleaving reasons.\n"));
+    }
   }

   /// Returns true if all loop blocks should be masked to fold tail loop.
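
Distilled, the new selection logic is a three-way decision with an EVL legality override. A minimal standalone sketch (the names `pickStyles` and `Style` are invented for illustration; the real method also caches the result in the member and emits the debug message):

    #include <utility>

    enum class Style { None, DataWithoutLaneMask, DataWithEVL, Other };

    // Hypothetical restatement of setTailFoldingStyles' decision order.
    std::pair<Style, Style> pickStyles(bool CanFoldTail, bool UserForced,
                                       Style Forced, bool EVLIsLegal,
                                       std::pair<Style, Style> TTIPreference) {
      if (!CanFoldTail)
        return {Style::None, Style::None}; // cannot mask the tail at all
      if (!UserForced)
        return TTIPreference;              // defer to the target's preference
      if (Forced == Style::DataWithEVL && !EVLIsLegal)
        return {Style::DataWithoutLaneMask,
                Style::DataWithoutLaneMask}; // generic masked fallback
      return {Forced, Forced};             // honor the command-line override
    }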
@@ -1544,6 +1580,18 @@ class LoopVectorizationCostModel {
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }

+  /// Returns true if VP intrinsics with explicit vector length support should
+  /// be generated in the tail folded loop.
+  bool foldTailWithEVL() const {
+    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+           // FIXME: remove this once vp_reverse is supported.
+           none_of(
+               WideningDecisions,
+               [](const std::pair<std::pair<Instruction *, ElementCount>,
+                                  std::pair<InstWidening, InstructionCost>>
+                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+  }
+
   /// Returns true if the Phi is part of an inloop reduction.
   bool isInLoopReduction(PHINode *Phi) const {
     return InLoopReductions.contains(Phi);
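
The `none_of` guard disables EVL whenever any memory access was assigned the `CM_Widen_Reverse` widening decision, since `vp_reverse` is not supported yet (per the FIXME). The same pattern in plain C++, with the container and value types simplified (the real map is keyed by `(Instruction *, ElementCount)` pairs):

    #include <algorithm>
    #include <map>
    #include <utility>

    enum InstWidening { CM_Widen, CM_Widen_Reverse, CM_GatherScatter };

    // Simplified stand-in for the cost model's WideningDecisions map.
    using DecisionMap = std::map<int, std::pair<InstWidening, unsigned>>;

    bool noReverseAccesses(const DecisionMap &Decisions) {
      return std::none_of(Decisions.begin(), Decisions.end(),
                          [](const auto &D) {
        return D.second.first == CM_Widen_Reverse; // any reversed access vetoes EVL
      });
    }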
@@ -1688,8 +1736,8 @@ class LoopVectorizationCostModel {
   /// Control finally chosen tail folding style. The first element is used if
   /// the IV update may overflow, the second element - if it does not.
-  std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
-      std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+      ChosenTailFoldingStyle;

   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
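
Switching the member from a default-initialized pair to `std::optional` separates "no decision has been made yet" from "decided not to fold" — the simplified assert in `setTailFoldingStyles` relies on exactly this distinction. A small illustration:

    #include <optional>
    #include <utility>

    enum class TFS { None, DataWithEVL };

    bool demo() {
      std::optional<std::pair<TFS, TFS>> Chosen; // unset: no decision yet
      bool Before = Chosen.has_value();          // false: "not decided"
      Chosen = std::make_pair(TFS::None, TFS::None); // decided: "do not fold"
      return !Before && Chosen.has_value();      // the two states are distinct
    }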
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  setTailFoldingStyles();
-  if (foldTailByMasking())
+  setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+  if (foldTailByMasking()) {
+    if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+             "try to generate VP Intrinsics with scalable vector "
+             "factors only.\n");
+      // Tail folded loop using VP intrinsics restricts the VF to be scalable
+      // for now.
+      // TODO: extend it for fixed vectors, if required.
+      assert(MaxFactors.ScalableVF.isScalable() &&
+             "Expected scalable vector factor.");
+
+      MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
     return MaxFactors;
+  }

   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
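
The clamp `MaxFactors.FixedVF = ElementCount::getFixed(1)` removes all fixed-width candidates from the race, while the assert guarantees a scalable factor is still available. A quick illustration of the `ElementCount` API used here (header path per the LLVM support library):

    #include "llvm/Support/TypeSize.h"

    void demo() {
      llvm::ElementCount FixedOne = llvm::ElementCount::getFixed(1);    // 1 lane
      llvm::ElementCount Scalable = llvm::ElementCount::getScalable(4); // vscale x 4
      (void)FixedOne.isScalable(); // false: fixed-width factor, now clamped out
      (void)Scalable.isScalable(); // true: scalable factor survives the clamp
    }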
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!isScalarEpilogueAllowed())
     return 1;

+  // Do not interleave if EVL is preferred and no User IC is specified.
+  if (foldTailWithEVL()) {
+    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+                         "Unroll factor forced to be 1.\n");
+    return 1;
+  }
+
   // We used the distance for the interleave count.
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VPlanTransforms::truncateToMinimalBitwidths(
         *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
     VPlanTransforms::optimize(*Plan, *PSE.getSE());
+    // TODO: try to put it close to addActiveLaneMask().
+    if (CM.foldTailWithEVL())
+      VPlanTransforms::addExplicitVectorLength(*Plan);
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }
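
`addExplicitVectorLength` rewrites the plan so each vector iteration is governed by a `VPInstruction::ExplicitVectorLength` value (the assertion in the memory recipes below checks for exactly this opcode). At code generation it should boil down to something like the following IRBuilder calls — a sketch, assuming the transform lowers EVL via `llvm.experimental.get.vector.length`; the actual recipe structure lives in VPlanTransforms.cpp, not in this hunk:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"

    // Sketch: EVL = get.vector.length(remaining, VF, /*scalable=*/true), i.e.
    // the number of lanes to process this iteration, capped by the hardware.
    llvm::Value *emitEVL(llvm::IRBuilderBase &Builder, llvm::Value *TripCount,
                         llvm::Value *IV, unsigned VFKnownMin) {
      llvm::Value *AVL = Builder.CreateSub(TripCount, IV, "avl"); // elements left
      return Builder.CreateIntrinsic(
          Builder.getInt32Ty(), llvm::Intrinsic::experimental_get_vector_length,
          {AVL, Builder.getInt32(VFKnownMin), Builder.getInt1(true)},
          /*FMFSource=*/nullptr, "evl");
    }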
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-  Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+  Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
   Value *DerivedIV = emitTransformedIndex(
       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
       Kind, cast_if_present<BinaryOperator>(FPBinOp));
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }

+/// Creates either vp_store or vp_scatter intrinsics calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+                                Value *StoredVal, bool IsScatter, Value *Mask,
+                                Value *EVL, const Align &Alignment) {
+  CallInst *Call;
+  if (IsScatter) {
+    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                   Intrinsic::vp_scatter,
+                                   {StoredVal, Addr, Mask, EVL});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVL->getContext()),
+        {StoredVal, Addr}));
+  }
+  Call->addParamAttr(
+      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsics calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+                                                   VectorType *DataTy,
+                                                   Value *Addr, bool IsGather,
+                                                   Value *Mask, Value *EVL,
+                                                   const Align &Alignment) {
+  CallInst *Call;
+  if (IsGather) {
+    Call =
+        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+                                nullptr, "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  Call->addParamAttr(
+      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Instruction *NewSI = nullptr;
       Value *StoredVal = State.get(StoredValue, Part);
-      if (CreateGatherScatter) {
+      // TODO: split this into several classes for better design.
+      if (State.EVL) {
+        assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                                "explicit vector length.");
+        assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+                   VPInstruction::ExplicitVectorLength &&
+               "EVL must be VPInstruction::ExplicitVectorLength.");
+        Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+        // If EVL is not nullptr, then EVL must be a valid value set during plan
+        // creation, possibly default value = whole vector register length. EVL
+        // is created only if TTI prefers predicated vectorization, thus if EVL
+        // is not nullptr it also implies preference for predicated
+        // vectorization.
+        // FIXME: Support reverse store after vp_reverse is added.
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        NewSI = lowerStoreUsingVectorIntrinsics(
+            Builder, State.get(getAddr(), Part, !CreateGatherScatter),
+            StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+      } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
-    if (CreateGatherScatter) {
+    // TODO: split this into several classes for better design.
+    if (State.EVL) {
+      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                              "explicit vector length.");
+      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+                 VPInstruction::ExplicitVectorLength &&
+             "EVL must be VPInstruction::ExplicitVectorLength.");
+      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly default value = whole vector register length. EVL
+      // is created only if TTI prefers predicated vectorization, thus if EVL
+      // is not nullptr it also implies preference for predicated
+      // vectorization.
+      // FIXME: Support reverse loading after vp_reverse is added.
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      NewLI = lowerLoadUsingVectorIntrinsics(
+          Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
+          CreateGatherScatter, MaskPart, EVL, Alignment);
+    } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
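
Taken together, each memory recipe now dispatches three ways. In the EVL branch, the extra `!CreateGatherScatter` argument to `State.get(getAddr(), ...)` presumably requests the scalar per-part address for consecutive accesses instead of a widened vector of pointers. A comment-only outline of the store path (the load path mirrors it; the final `else` is the pre-existing widened-store lowering, truncated in this diff):

    // if (State.EVL)                -> lowerStoreUsingVectorIntrinsics(...):
    //                                  vp.store if consecutive, vp.scatter if not
    // else if (CreateGatherScatter) -> Builder.CreateMaskedScatter(...)
    // else                          -> pre-existing consecutive store lowering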